Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce allocations in UTF16 conversion #3113

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions winlogbeat/sys/strings.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,49 @@ import (
"unicode/utf16"
)

// UTF16BytesToString returns a string that is decoded from the UTF-16
// (little-endian) bytes in b.
//
// Decoding stops at the first null terminator (a 16-bit code unit equal to
// zero) if one is present; otherwise the whole buffer is decoded.
//
// The byte slice must be of even length, otherwise an error is returned along
// with an empty string and an offset of 0.
//
// The integer returned is the byte offset to the start of the next string
// within the buffer if any data follows the null terminator, otherwise -1 is
// returned.
func UTF16BytesToString(b []byte) (string, int, error) {
	if len(b)%2 != 0 {
		return "", 0, fmt.Errorf("Slice must have an even length (length=%d)", len(b))
	}

	offset := -1

	// Find the null terminator if it exists and re-slice b so that only the
	// first string is decoded. Index 0 is a valid position (the buffer begins
	// with an empty string), so compare against the -1 "not found" sentinel
	// rather than against zero.
	if nullIndex := indexNullTerminator(b); nullIndex >= 0 {
		// Only report an offset when there are bytes after the terminator.
		if len(b) > nullIndex+2 {
			offset = nullIndex + 2
		}

		b = b[:nullIndex]
	}

	// Assemble little-endian byte pairs into UTF-16 code units.
	s := make([]uint16, len(b)/2)
	for i := range s {
		s[i] = uint16(b[i*2]) | uint16(b[i*2+1])<<8
	}

	return string(utf16.Decode(s)), offset, nil
}

// indexNullTerminator returns the byte index of the first null terminator
// within a buffer containing UTF-16 encoded data. Only code units aligned on
// even byte offsets are examined. If the null terminator is not found -1 is
// returned.
func indexNullTerminator(b []byte) int {
	// The i+1 bound guarantees both bytes of the code unit exist, so a
	// trailing odd byte is ignored instead of causing an out-of-range read.
	for i := 0; i+1 < len(b); i += 2 {
		if b[i] == 0 && b[i+1] == 0 {
			return i
		}
	}

	return -1
}

// RemoveWindowsLineEndings replaces carriage return line feed (CRLF) with
Expand Down
77 changes: 77 additions & 0 deletions winlogbeat/sys/strings_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package sys

import (
"bytes"
"encoding/binary"
"testing"
"unicode/utf16"

"github.com/stretchr/testify/assert"
)

// toUTF16Bytes encodes in as UTF-16 and returns the code units serialized as
// little-endian bytes. An empty input yields a nil slice.
//
// Review note (from the pull-request discussion): a reviewer asked whether the
// standard-library conversion is avoided in the non-test code because it is
// slower. The author's answer was that UTF16BytesToString performs a
// []byte -> string conversion and the standard library has no function that
// does that directly (it only offers []uint16 -> string).
func toUTF16Bytes(in string) []byte {
	units := utf16.Encode([]rune(in))
	if len(units) == 0 {
		return nil
	}

	// Write each code unit directly; unlike binary.Write this cannot fail, so
	// there is no error to drop.
	out := make([]byte, 2*len(units))
	for i, u := range units {
		binary.LittleEndian.PutUint16(out[2*i:], u)
	}
	return out
}

// TestUTF16BytesToString checks that a string containing ASCII and non-ASCII
// characters is returned unchanged after being encoded with toUTF16Bytes and
// decoded with UTF16BytesToString.
func TestUTF16BytesToString(t *testing.T) {
	const want = "abc白鵬翔\u145A6"

	got, _, err := UTF16BytesToString(toUTF16Bytes(want))
	if err != nil {
		t.Fatal(err)
	}

	assert.Equal(t, want, got)
}

// TestUTF16BytesToStringOffset walks a buffer holding three strings separated
// by null terminators and checks both the decoded string and the returned
// offset at every step. The final string has nothing after it, so its offset
// is expected to be -1.
func TestUTF16BytesToStringOffset(t *testing.T) {
	buf := bytes.Join([][]byte{toUTF16Bytes("one"), toUTF16Bytes("two"), toUTF16Bytes("three")}, []byte{0, 0})

	steps := []struct {
		str    string
		offset int
	}{
		{"one", 8},
		{"two", 8},
		{"three", -1},
	}

	// next is the offset reported by the previous call; it is 0 for the first
	// step so the whole buffer is used.
	next := 0
	for _, step := range steps {
		buf = buf[next:]

		str, offset, err := UTF16BytesToString(buf)
		if err != nil {
			t.Fatal(err)
		}
		assert.Equal(t, step.str, str)
		assert.Equal(t, step.offset, offset)

		next = offset
	}
}

// BenchmarkUTF16BytesToString measures UTF16BytesToString on an exactly-sized
// input and on the same string followed by 2048 zero bytes.
func BenchmarkUTF16BytesToString(b *testing.B) {
	utf16Bytes := toUTF16Bytes("A logon was attempted using explicit credentials.")

	b.Run("simple_string", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			UTF16BytesToString(utf16Bytes)
		}
	})

	// Buffer larger than the string. It is built once, outside the closure:
	// the benchmark function is invoked repeatedly with increasing b.N, and
	// appending to the captured slice inside it would grow the input by
	// another 2048 bytes on every invocation.
	larger := make([]byte, len(utf16Bytes)+2048)
	copy(larger, utf16Bytes)

	b.Run("larger_buffer", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			UTF16BytesToString(larger)
		}
	})
}