Skip to content

Commit

Permalink
[pkg/stanza] Extract trim func from split package (#26536)
Browse files Browse the repository at this point in the history
Follows
#26241

Previously, split funcs were responsible for applying trim funcs. This
PR increases composability by applying trim funcs as a wrapper around
split funcs.

One nuance that was surfaced here is that the newline split func was not
handling the case where a line starts with a newline. When this happens,
we need to tell the scanner to advance, but we still want to return a
`""` token, rather than nil. This is covered by existing tests, but
previously it was "fixed" by the trim func which would return an empty
slice when the token was nil. Now, the newline split func will
explicitly handle this case, while the trim func will return the
original value if it is nil or empty.
  • Loading branch information
djaglowski authored Sep 13, 2023
1 parent a3eacd6 commit 82d0db2
Show file tree
Hide file tree
Showing 16 changed files with 328 additions and 201 deletions.
30 changes: 30 additions & 0 deletions .chloggen/pkg-stanza-extract-trim-split.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: breaking

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: pkg/stanza

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Make trim func composable

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [26536]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: |
  - Adds trim.WithFunc to allow trim funcs to wrap bufio.SplitFuncs.
  - Removes trim.Func from split.Config.Func. Use trim.WithFunc instead.
  - Removes trim.Func from flush.WithPeriod. Use trim.WithFunc instead.
# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [api]
2 changes: 1 addition & 1 deletion pkg/stanza/fileconsumer/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ func (c Config) BuildWithSplitFunc(logger *zap.SugaredLogger, emit emit.Callback
}

// Ensure that splitter is buildable
factory := splitter.NewCustomFactory(splitFunc, c.FlushPeriod)
factory := splitter.NewCustomFactory(splitFunc, c.TrimConfig.Func(), c.FlushPeriod)
if _, err := factory.SplitFunc(); err != nil {
return nil, err
}
Expand Down
10 changes: 7 additions & 3 deletions pkg/stanza/fileconsumer/internal/header/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/operator"
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/pipeline"
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/split"
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/trim"
)

type Config struct {
Expand Down Expand Up @@ -69,13 +70,16 @@ func NewConfig(matchRegex string, metadataOperators []operator.Config, enc encod
return nil, fmt.Errorf("failed to compile `pattern`: %w", err)
}

splitFunc, err := split.NewlineSplitFunc(enc, false, func(b []byte) []byte {
return bytes.Trim(b, "\r\n")
})
splitFunc, err := split.NewlineSplitFunc(enc, false)
if err != nil {
return nil, fmt.Errorf("failed to create split func: %w", err)
}

var trimFunc trim.Func = func(b []byte) []byte {
return bytes.Trim(b, "\r\n")
}
splitFunc = trim.WithFunc(splitFunc, trimFunc)

return &Config{
regex: regex,
SplitFunc: splitFunc,
Expand Down
6 changes: 4 additions & 2 deletions pkg/stanza/fileconsumer/internal/splitter/custom.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,21 @@ import (

// customFactory is a Factory that builds its split func from a
// caller-supplied bufio.SplitFunc rather than from a split configuration.
type customFactory struct {
// splitFunc is the caller-supplied base split func.
splitFunc bufio.SplitFunc
// trimFunc is applied to tokens via trim.WithFunc when the split func is built.
trimFunc trim.Func
// flushPeriod is handed to flush.WithPeriod when the split func is built.
flushPeriod time.Duration
}

var _ Factory = (*customFactory)(nil)

func NewCustomFactory(splitFunc bufio.SplitFunc, flushPeriod time.Duration) Factory {
func NewCustomFactory(splitFunc bufio.SplitFunc, trimFunc trim.Func, flushPeriod time.Duration) Factory {
return &customFactory{
splitFunc: splitFunc,
trimFunc: trimFunc,
flushPeriod: flushPeriod,
}
}

// SplitFunc builds a bufio.SplitFunc based on the configuration
func (f *customFactory) SplitFunc() (bufio.SplitFunc, error) {
return flush.WithPeriod(f.splitFunc, trim.Nop, f.flushPeriod), nil
return trim.WithFunc(flush.WithPeriod(f.splitFunc, f.flushPeriod), f.trimFunc), nil
}
62 changes: 60 additions & 2 deletions pkg/stanza/fileconsumer/internal/splitter/custom_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ import (
"time"

"github.com/stretchr/testify/assert"

"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/trim"
)

func TestCustom(t *testing.T) {
factory := NewCustomFactory(bufio.ScanLines, 0)
factory := NewCustomFactory(bufio.ScanLines, trim.Nop, 0)
splitFunc, err := factory.SplitFunc()
assert.NoError(t, err)
assert.NotNil(t, splitFunc)
Expand All @@ -35,9 +37,33 @@ func TestCustom(t *testing.T) {
assert.Nil(t, token)
}

// TestCustomWithTrim checks that a custom factory built with trim.Whitespace
// and no flush period returns whitespace-trimmed tokens while still advancing
// past each complete, untrimmed line.
func TestCustomWithTrim(t *testing.T) {
	fn, buildErr := NewCustomFactory(bufio.ScanLines, trim.Whitespace, 0).SplitFunc()
	assert.NoError(t, buildErr)
	assert.NotNil(t, fn)

	data := []byte(" hello \n world \n extra ")

	// First line: the 8 bytes of " hello \n" are consumed, token is trimmed.
	n, tok, err := fn(data, false)
	assert.NoError(t, err)
	assert.Equal(t, 8, n)
	assert.Equal(t, []byte("hello"), tok)

	// Second line: the 8 bytes of " world \n" are consumed, token is trimmed.
	n, tok, err = fn(data[8:], false)
	assert.NoError(t, err)
	assert.Equal(t, 8, n)
	assert.Equal(t, []byte("world"), tok)

	// Remainder has no terminating newline and we are not at EOF,
	// so nothing is emitted and nothing is consumed.
	n, tok, err = fn(data[16:], false)
	assert.NoError(t, err)
	assert.Equal(t, 0, n)
	assert.Nil(t, tok)
}

func TestCustomWithFlush(t *testing.T) {
flushPeriod := 100 * time.Millisecond
factory := NewCustomFactory(bufio.ScanLines, flushPeriod)
factory := NewCustomFactory(bufio.ScanLines, trim.Nop, flushPeriod)
splitFunc, err := factory.SplitFunc()
assert.NoError(t, err)
assert.NotNil(t, splitFunc)
Expand Down Expand Up @@ -66,3 +92,35 @@ func TestCustomWithFlush(t *testing.T) {
assert.Equal(t, 7, advance)
assert.Equal(t, []byte(" extra "), token)
}

// TestCustomWithFlushTrim checks that a custom factory built with both a
// flush period and trim.Whitespace trims regular tokens and also trims the
// token that is force-flushed once the flush period has elapsed.
func TestCustomWithFlushTrim(t *testing.T) {
	period := 100 * time.Millisecond

	fn, buildErr := NewCustomFactory(bufio.ScanLines, trim.Whitespace, period).SplitFunc()
	assert.NoError(t, buildErr)
	assert.NotNil(t, fn)

	data := []byte(" hello \n world \n extra ")

	// First line: the 8 bytes of " hello \n" are consumed, token is trimmed.
	n, tok, err := fn(data, false)
	assert.NoError(t, err)
	assert.Equal(t, 8, n)
	assert.Equal(t, []byte("hello"), tok)

	// Second line: the 8 bytes of " world \n" are consumed, token is trimmed.
	n, tok, err = fn(data[8:], false)
	assert.NoError(t, err)
	assert.Equal(t, 8, n)
	assert.Equal(t, []byte("world"), tok)

	// The unterminated remainder is not emitted before the flush period expires.
	n, tok, err = fn(data[16:], false)
	assert.NoError(t, err)
	assert.Equal(t, 0, n)
	assert.Nil(t, tok)

	// Wait long enough for the flusher to consider the data stale.
	time.Sleep(2 * period)

	// All 7 remaining bytes are consumed; the flushed token must be trimmed too.
	n, tok, err = fn(data[16:], false)
	assert.NoError(t, err)
	assert.Equal(t, 7, n)
	assert.Equal(t, []byte("extra"), tok)
}
9 changes: 7 additions & 2 deletions pkg/stanza/fileconsumer/internal/splitter/multiline.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,14 @@ func NewSplitFuncFactory(

// SplitFunc builds a bufio.SplitFunc based on the configuration
func (f *splitFuncFactory) SplitFunc() (bufio.SplitFunc, error) {
splitFunc, err := f.splitConfig.Func(f.encoding, false, f.maxLogSize, f.trimFunc)
splitFunc, err := f.splitConfig.Func(f.encoding, false, f.maxLogSize)
if err != nil {
return nil, err
}
return flush.WithPeriod(splitFunc, f.trimFunc, f.flushPeriod), nil
splitFunc = flush.WithPeriod(splitFunc, f.flushPeriod)
if f.encoding == encoding.Nop {
// Special case where we should never trim
return splitFunc, nil
}
return trim.WithFunc(splitFunc, f.trimFunc), nil
}
26 changes: 25 additions & 1 deletion pkg/stanza/fileconsumer/internal/splitter/multiline_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,30 @@ func TestSplitFunc(t *testing.T) {
assert.Nil(t, token)
}

// TestSplitFuncWithTrim checks that a split func factory built from a
// zero-value split.Config with UTF-8 encoding, trim.Whitespace and no flush
// period returns whitespace-trimmed tokens while advancing past each
// complete, untrimmed line.
func TestSplitFuncWithTrim(t *testing.T) {
	fn, buildErr := NewSplitFuncFactory(split.Config{}, unicode.UTF8, 1024, trim.Whitespace, 0).SplitFunc()
	assert.NoError(t, buildErr)
	assert.NotNil(t, fn)

	data := []byte(" hello \n world \n extra ")

	// First line: the 8 bytes of " hello \n" are consumed, token is trimmed.
	n, tok, err := fn(data, false)
	assert.NoError(t, err)
	assert.Equal(t, 8, n)
	assert.Equal(t, []byte("hello"), tok)

	// Second line: the 8 bytes of " world \n" are consumed, token is trimmed.
	n, tok, err = fn(data[8:], false)
	assert.NoError(t, err)
	assert.Equal(t, 8, n)
	assert.Equal(t, []byte("world"), tok)

	// Remainder has no terminating newline and we are not at EOF,
	// so nothing is emitted and nothing is consumed.
	n, tok, err = fn(data[16:], false)
	assert.NoError(t, err)
	assert.Equal(t, 0, n)
	assert.Nil(t, tok)
}

func TestSplitFuncWithFlush(t *testing.T) {
flushPeriod := 100 * time.Millisecond
factory := NewSplitFuncFactory(split.Config{}, unicode.UTF8, 1024, trim.Nop, flushPeriod)
Expand Down Expand Up @@ -81,7 +105,7 @@ func TestSplitFuncWithFlush(t *testing.T) {
assert.Equal(t, []byte(" extra "), token)
}

func TestSplitFuncWithTrim(t *testing.T) {
func TestSplitFuncWithFlushTrim(t *testing.T) {
flushPeriod := 100 * time.Millisecond
factory := NewSplitFuncFactory(split.Config{}, unicode.UTF8, 1024, trim.Whitespace, flushPeriod)
splitFunc, err := factory.SplitFunc()
Expand Down
10 changes: 4 additions & 6 deletions pkg/stanza/flush/flush.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@ package flush // import "github.com/open-telemetry/opentelemetry-collector-contr
import (
"bufio"
"time"

"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/trim"
)

// WithPeriod wraps a bufio.SplitFunc with a flusher.
// If period is not positive, splitFunc is returned unchanged.
func WithPeriod(splitFunc bufio.SplitFunc, trimFunc trim.Func, period time.Duration) bufio.SplitFunc {
func WithPeriod(splitFunc bufio.SplitFunc, period time.Duration) bufio.SplitFunc {
if period <= 0 {
return splitFunc
}
Expand All @@ -20,7 +18,7 @@ func WithPeriod(splitFunc bufio.SplitFunc, trimFunc trim.Func, period time.Durat
forcePeriod: period,
previousDataLength: 0,
}
return f.splitFunc(splitFunc, trimFunc)
return f.splitFunc(splitFunc)
}

// flusher keeps information about flush state
Expand Down Expand Up @@ -61,7 +59,7 @@ func (f *flusher) shouldFlush() bool {
return f.forcePeriod > 0 && time.Since(f.lastDataChange) > f.forcePeriod && f.previousDataLength > 0
}

func (f *flusher) splitFunc(splitFunc bufio.SplitFunc, trimFunc trim.Func) bufio.SplitFunc {
func (f *flusher) splitFunc(splitFunc bufio.SplitFunc) bufio.SplitFunc {
return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
advance, token, err = splitFunc(data, atEOF)

Expand All @@ -81,7 +79,7 @@ func (f *flusher) splitFunc(splitFunc bufio.SplitFunc, trimFunc trim.Func) bufio
if f.shouldFlush() {
// Inform flusher that we just flushed
f.flushed()
token = trimFunc(data)
token = data
advance = len(data)
return
}
Expand Down
6 changes: 2 additions & 4 deletions pkg/stanza/flush/flush_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ import (
"time"

"github.com/stretchr/testify/assert"

"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/trim"
)

func TestFlusher(t *testing.T) {
Expand All @@ -22,7 +20,7 @@ func TestFlusher(t *testing.T) {
// always use atEOF=false.

flushPeriod := 100 * time.Millisecond
f := WithPeriod(bufio.ScanWords, trim.Nop, flushPeriod)
f := WithPeriod(bufio.ScanWords, flushPeriod)

content := []byte("foo bar hellowo")

Expand Down Expand Up @@ -64,7 +62,7 @@ func TestNoFlushPeriod(t *testing.T) {
// In other words, we should expect exactly the behavior of bufio.ScanWords.

flushPeriod := time.Duration(0)
f := WithPeriod(bufio.ScanWords, trim.Nop, flushPeriod)
f := WithPeriod(bufio.ScanWords, flushPeriod)

content := []byte("foo bar hellowo")

Expand Down
12 changes: 4 additions & 8 deletions pkg/stanza/operator/input/tcp/tcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,8 @@ type BaseConfig struct {

type SplitFuncBuilder func(enc encoding.Encoding) (bufio.SplitFunc, error)

func (c Config) defaultMultilineBuilder(enc encoding.Encoding) (bufio.SplitFunc, error) {
trimFunc := c.TrimConfig.Func()
splitFunc, err := c.SplitConfig.Func(enc, true, int(c.MaxLogSize), trimFunc)
if err != nil {
return nil, err
}
return splitFunc, nil
func (c Config) defaultSplitFuncBuilder(enc encoding.Encoding) (bufio.SplitFunc, error) {
return c.SplitConfig.Func(enc, true, int(c.MaxLogSize))
}

// Build will build a tcp input operator.
Expand Down Expand Up @@ -121,14 +116,15 @@ func (c Config) Build(logger *zap.SugaredLogger) (operator.Operator, error) {
}

if c.SplitFuncBuilder == nil {
c.SplitFuncBuilder = c.defaultMultilineBuilder
c.SplitFuncBuilder = c.defaultSplitFuncBuilder
}

// Build split func
splitFunc, err := c.SplitFuncBuilder(enc)
if err != nil {
return nil, err
}
splitFunc = trim.WithFunc(splitFunc, c.TrimConfig.Func())

var resolver *helper.IPResolver
if c.AddAttributes {
Expand Down
5 changes: 3 additions & 2 deletions pkg/stanza/operator/input/udp/udp.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,12 @@ func (c Config) Build(logger *zap.SugaredLogger) (operator.Operator, error) {
return nil, err
}

// Build SplitFunc
splitFunc, err := c.SplitConfig.Func(enc, true, MaxUDPSize, c.TrimConfig.Func())
// Build split func
splitFunc, err := c.SplitConfig.Func(enc, true, MaxUDPSize)
if err != nil {
return nil, err
}
splitFunc = trim.WithFunc(splitFunc, c.TrimConfig.Func())

var resolver *helper.IPResolver
if c.AddAttributes {
Expand Down
Loading

0 comments on commit 82d0db2

Please sign in to comment.