Feature: Dissect processor #6925
@@ -115,3 +115,80 @@ def test_condition(self):
        assert "beat.name" in output
        assert "message" in output
        assert "test" in output["message"]

    def test_dissect_good_tokenizer(self):
        """
        Check dissect with a good tokenizer
        """
        self.render_config_template(
            path=os.path.abspath(self.working_dir) + "/test.log",
            processors=[{
                "dissect": {
                    "tokenizer": "\"%{key} world\"",
                    "field": "message",
                    "target_prefix": "extracted"
Review comment: If you leave this one out, you can test the defaults.
Reply: I have added something specific to test all the defaults instead; I prefer clearer intent.
                },
            }]
        )
        with open(self.working_dir + "/test.log", "w") as f:
            f.write("Hello world\n")

        filebeat = self.start_beat()
        self.wait_until(lambda: self.output_has(lines=1))
        filebeat.check_kill_and_wait()

        output = self.read_output(
            required_fields=["@timestamp"],
        )[0]
        assert output["extracted.key"] == "Hello"

    def test_dissect_defaults(self):
        """
        Check dissect defaults
        """
        self.render_config_template(
            path=os.path.abspath(self.working_dir) + "/test.log",
            processors=[{
                "dissect": {
                    "tokenizer": "\"%{key} world\"",
                },
            }]
        )
        with open(self.working_dir + "/test.log", "w") as f:
            f.write("Hello world\n")

        filebeat = self.start_beat()
        self.wait_until(lambda: self.output_has(lines=1))
        filebeat.check_kill_and_wait()

        output = self.read_output(
            required_fields=["@timestamp"],
        )[0]
        assert output["dissect.key"] == "Hello"

    def test_dissect_bad_tokenizer(self):
        """
        Check dissect with a bad tokenizer
        """
        self.render_config_template(
            path=os.path.abspath(self.working_dir) + "/test.log",
            processors=[{
                "dissect": {
                    "tokenizer": "\"not %{key} world\"",
                    "field": "message",
                    "target_prefix": "extracted"
                },
            }]
        )
        with open(self.working_dir + "/test.log", "w") as f:
            f.write("Hello world\n")

        filebeat = self.start_beat()
        self.wait_until(lambda: self.output_has(lines=1))
        filebeat.check_kill_and_wait()

        output = self.read_output(
            required_fields=["@timestamp"],
        )[0]
        assert "extracted.key" not in output
        assert output["message"] == "Hello world"
@@ -47,6 +47,7 @@ The supported processors are:
* <<add-kubernetes-metadata,`add_kubernetes_metadata`>>
* <<add-docker-metadata,`add_docker_metadata`>>
* <<add-host-metadata,`add_host_metadata`>>
* <<dissect,`dissect`>>

[[conditions]]
==== Conditions

@@ -761,3 +762,34 @@ The fields added to the event are looking as following:
-------------------------------------------------------------------------------

NOTE: The host information is refreshed every 5 minutes.

[[dissect]]
=== Dissect strings

The dissect processor tokenizes incoming strings using defined patterns.

[source,yaml]
-------
processors:
 - dissect:
     tokenizer: "%{key1} %{key2}"
     field: "message"
     target_prefix: "dissect"
-------

The `dissect` processor has the following configuration settings:

`field`:: (Optional) The event field to tokenize. Default is `message`.

`target_prefix`:: (Optional) The name of the field where the extracted values are stored. When an
empty string is defined, the processor creates the keys at the root of the event. Default is
`dissect`. When the target key already exists in the event, the processor won't replace it and will
log an error; you need to either drop or rename the key before using dissect.

Review comment: Can you document what happens if the target field already exists?

For tokenization to be successful, all keys must be found and extracted; if any of them cannot be
found, an error is logged and the original event is left unmodified.

NOTE: A key can contain any characters except the reserved suffix or prefix modifiers: `/`, `&`, `+`
and `?`.
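
For illustration (this example is not part of the PR; it simply applies the behavior described
above): with the tokenizer `"%{key1} %{key2}"` and an event whose `message` field is
`"hello world"`, the processor stores `key1: hello` and `key2: world` under the `dissect` prefix:

[source,json]
-------
{
  "message": "hello world",
  "dissect": {
    "key1": "hello",
    "key2": "world"
  }
}
-------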

See <<conditions>> for a list of supported conditions.
@@ -0,0 +1 @@
dissect_tests.json

@@ -0,0 +1,25 @@
package dissect

type config struct {
	Tokenizer    *tokenizer `config:"tokenizer"`
	Field        string     `config:"field"`
	TargetPrefix string     `config:"target_prefix"`
}

var defaultConfig = config{
	Field:        "message",
Review comment: Based on elastic/ecs#3 this could become log.message (not now).
	TargetPrefix: "dissect",
}

// tokenizer adds validation at the unpack level for this specific field.
type tokenizer = Dissector

// Unpack unpacks a tokenizer into a Dissector; this triggers the normal
// validation of the dissector.
func (t *tokenizer) Unpack(v string) error {
	d, err := New(v)
	if err != nil {
		return err
	}
	*t = *d
	return nil
}
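
The alias plus `Unpack` means an invalid tokenizer is rejected when the configuration is loaded
rather than when the first event is processed. A minimal, self-contained sketch of the same
go-ucfg validate-on-unpack technique (the `pattern` type and its validation rule are illustrative
stand-ins, not code from this PR):

package main

import (
	"fmt"
	"strings"

	ucfg "github.com/elastic/go-ucfg"
)

// pattern validates itself while go-ucfg decodes the string value.
type pattern string

// Unpack is invoked by go-ucfg during config decoding, so a bad value
// fails at load time instead of at run time.
func (p *pattern) Unpack(v string) error {
	if !strings.Contains(v, "%{") { // hypothetical validation rule
		return fmt.Errorf("invalid pattern: %q", v)
	}
	*p = pattern(v)
	return nil
}

type settings struct {
	Tokenizer pattern `config:"tokenizer"`
}

func main() {
	c, err := ucfg.NewFrom(map[string]interface{}{"tokenizer": "%{key} world"})
	if err != nil {
		panic(err)
	}
	s := settings{}
	if err := c.Unpack(&s); err != nil {
		fmt.Println("config error:", err) // reached when the pattern is invalid
		return
	}
	fmt.Println("tokenizer:", s.Tokenizer)
}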
@@ -0,0 +1,43 @@
package dissect

import (
	"testing"

	"github.com/stretchr/testify/assert"

	"github.com/elastic/beats/libbeat/common"
)

func TestTokenizerType(t *testing.T) {
	t.Run("valid", func(t *testing.T) {
		c, err := common.NewConfigFrom(map[string]interface{}{
			"tokenizer": "%{value1}",
			"field":     "message",
		})
		if !assert.NoError(t, err) {
			return
		}

		cfg := config{}
		err = c.Unpack(&cfg)
		if !assert.NoError(t, err) {
			return
		}
	})

	t.Run("invalid", func(t *testing.T) {
		c, err := common.NewConfigFrom(map[string]interface{}{
			"tokenizer": "%value1}",
			"field":     "message",
		})
		if !assert.NoError(t, err) {
			return
		}

		cfg := config{}
		err = c.Unpack(&cfg)
		if !assert.Error(t, err) {
			return
		}
	})
}
@@ -0,0 +1,32 @@
package dissect

import (
	"errors"
	"regexp"
)

var (
	// delimiterRE tokenizes a string into a walkable list of extracted
	// delimiter + key pairs. For example, the string:
	//   ` %{key}, %{key/2}`
	// becomes:
	//   [["", "key"], [", ", "key/2"]]
	delimiterRE = regexp.MustCompile("(?s)(.*?)%\\{([^}]*?)}")
	suffixRE    = regexp.MustCompile("(.+?)(/(\\d{1,2}))?(->)?$")

	skipFieldPrefix      = "?"
Review comment: I wonder if we really need to support all of these prefixes, or if we can leave the more complex ones to LS?
Reply: Since the syntax is still quite small, I prefer we keep the two implementations as close as possible. If we ever make changes or additions to the syntax, we can discuss it again.
Reply: Also, the indirect field is really useful. Take this example:
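(The example itself was not captured on this page; a hypothetical illustration of the indirect prefix: with the tokenizer `%{?key} %{&key}` applied to the line `status 200`, `%{?key}` captures `status` as a name only, and `%{&key}` stores `200` under that name, yielding the field `status: 200`.)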

	appendFieldPrefix    = "+"
	indirectFieldPrefix  = "&"
	appendIndirectPrefix = "+&"
	indirectAppendPrefix = "&+"
	greedySuffix         = "->"

	defaultJoinString = " "

	errParsingFailure            = errors.New("parsing failure")
	errInvalidTokenizer          = errors.New("invalid dissect tokenizer")
	errEmpty                     = errors.New("empty string provided")
	errMixedPrefixIndirectAppend = errors.New("mixed prefix `&+`")
	errMixedPrefixAppendIndirect = errors.New("mixed prefix `+&`")
	errEmptyKey                  = errors.New("empty key")
)
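
To see concretely what these regular expressions extract, here is a standalone sketch (not part of
the PR; the patterns are copied verbatim from the file above):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	delimiterRE := regexp.MustCompile("(?s)(.*?)%\\{([^}]*?)}")
	suffixRE := regexp.MustCompile("(.+?)(/(\\d{1,2}))?(->)?$")

	// Split a tokenizer into (delimiter, key) pairs.
	for _, m := range delimiterRE.FindAllStringSubmatch("%{key}, %{key/2}", -1) {
		// m[1] is the delimiter preceding the key, m[2] is the raw key.
		fmt.Printf("delimiter=%q key=%q\n", m[1], m[2])
	}
	// Prints:
	// delimiter="" key="key"
	// delimiter=", " key="key/2"

	// Decompose a raw key into its name, optional /N ordering suffix, and
	// optional -> greedy suffix.
	s := suffixRE.FindStringSubmatch("key/2->")
	fmt.Printf("name=%q position=%q greedy=%q\n", s[1], s[3], s[4])
	// Prints:
	// name="key" position="2" greedy="->"
}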

Review comment: Should we have some guidelines / requirements on what users should use for the keys? Like don't use spaces, for example? As dissect allows any match to be a key, this will become interesting.
Reply: A space is fine; I added a note about the suffix and prefix modifiers.