Skip to content

Commit

Permalink
Import additional indexing settings on external fields (#752)
Browse files Browse the repository at this point in the history
Fields imported from ECS may contain some settings that may affect
how they are indexed:
* `multi_fields` allows indexing the same field in different ways for different
  purposes.
* `index: false` disables indexing of the field.
* `doc_values: false` disables storing some internal data used when sorting
  and by some aggregations; this can help to save disk space at the cost of losing some functionality for those fields.
  • Loading branch information
jsoriano authored Mar 29, 2022
1 parent 9c95e81 commit 4845791
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 20 deletions.
8 changes: 8 additions & 0 deletions internal/docs/exported_fields.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,14 @@ func visitFields(namePrefix string, f fields.FieldDefinition, records []fieldsTa
unit: f.Unit,
metricType: f.MetricType,
})

for _, multiField := range f.MultiFields {
records = append(records, fieldsTableRecord{
name: name + "." + multiField.Name,
description: fmt.Sprintf("Multi-field of %#q.", name),
aType: multiField.Type,
})
}
return records, nil
}

Expand Down
27 changes: 24 additions & 3 deletions internal/fields/dependency_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,21 @@ func buildFieldPath(root string, field common.MapStr) string {

func transformImportedField(fd FieldDefinition) common.MapStr {
m := common.MapStr{
"name": fd.Name,
"description": fd.Description,
"type": fd.Type,
"name": fd.Name,
"type": fd.Type,
}

// Multi-fields don't have descriptions.
if fd.Description != "" {
m["description"] = fd.Description
}

if fd.Index != nil {
m["index"] = *fd.Index
}

if fd.DocValues != nil {
m["doc_values"] = *fd.DocValues
}

if len(fd.Fields) > 0 {
Expand All @@ -221,5 +233,14 @@ func transformImportedField(fd FieldDefinition) common.MapStr {
}
m.Put("fields", t)
}

if len(fd.MultiFields) > 0 {
var t []common.MapStr
for _, f := range fd.MultiFields {
i := transformImportedField(f)
t = append(t, i)
}
m.Put("multi_fields", t)
}
return m
}
84 changes: 84 additions & 0 deletions internal/fields/dependency_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,71 @@ func TestDependencyManagerInjectExternalFields(t *testing.T) {
changed: true,
valid: true,
},
{
title: "multi fields",
defs: []common.MapStr{
{
"name": "process.command_line",
"external": "test",
},
},
result: []common.MapStr{
{
"name": "process.command_line",
"type": "wildcard",
"description": "Full command line that started the process.",
"multi_fields": []common.MapStr{
{
"name": "text",
"type": "match_only_text",
},
},
},
},
changed: true,
valid: true,
},
{
title: "not indexed external",
defs: []common.MapStr{
{
"name": "event.original",
"external": "test",
},
},
result: []common.MapStr{
{
"name": "event.original",
"type": "text",
"description": "Original event.",
"index": false,
"doc_values": false,
},
},
changed: true,
valid: true,
},
{
title: "override not indexed external",
defs: []common.MapStr{
{
"name": "event.original",
"index": true,
"external": "test",
},
},
result: []common.MapStr{
{
"name": "event.original",
"type": "text",
"description": "Original event.",
"index": true,
"doc_values": false,
},
},
changed: true,
valid: true,
},
{
title: "unknown field",
defs: []common.MapStr{
Expand All @@ -128,6 +193,7 @@ func TestDependencyManagerInjectExternalFields(t *testing.T) {
},
}

indexFalse := false
schema := map[string][]FieldDefinition{"test": []FieldDefinition{
{
Name: "container.id",
Expand All @@ -144,6 +210,24 @@ func TestDependencyManagerInjectExternalFields(t *testing.T) {
Description: "Data stream dataset.",
Type: "constant_keyword",
},
{
Name: "process.command_line",
Description: "Full command line that started the process.",
Type: "wildcard",
MultiFields: []FieldDefinition{
{
Name: "text",
Type: "match_only_text",
},
},
},
{
Name: "event.original",
Description: "Original event.",
Type: "text",
Index: &indexFalse,
DocValues: &indexFalse,
},
}}
dm := &DependencyManager{schema: schema}

Expand Down
51 changes: 34 additions & 17 deletions internal/fields/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ type FieldDefinition struct {
Unit string `yaml:"unit"`
MetricType string `yaml:"metric_type"`
External string `yaml:"external"`
Fields []FieldDefinition `yaml:"fields"`
Index *bool `yaml:"index"`
DocValues *bool `yaml:"doc_values"`
Fields []FieldDefinition `yaml:"fields,omitempty"`
MultiFields []FieldDefinition `yaml:"multi_fields,omitempty"`
}

func (orig *FieldDefinition) Update(fd FieldDefinition) {
Expand Down Expand Up @@ -42,26 +45,40 @@ func (orig *FieldDefinition) Update(fd FieldDefinition) {
if fd.External != "" {
orig.External = fd.External
}
if fd.Index != nil {
orig.Index = fd.Index
}
if fd.DocValues != nil {
orig.DocValues = fd.DocValues
}

if len(fd.Fields) > 0 {
// When a subfield the same name exists, update it. When not, append it.
updatedFields := make([]FieldDefinition, len(orig.Fields))
copy(updatedFields, orig.Fields)
for _, newField := range fd.Fields {
found := false
for i, origField := range orig.Fields {
if origField.Name != newField.Name {
continue
}
orig.Fields = updateFields(orig.Fields, fd.Fields)
}

found = true
updatedFields[i].Update(newField)
break
}
if !found {
updatedFields = append(updatedFields, newField)
if len(fd.MultiFields) > 0 {
orig.MultiFields = updateFields(orig.MultiFields, fd.MultiFields)
}
}

// updateFields merges fields into origFields and returns the merged slice.
//
// The merge works on a copy of origFields, so the slice passed by the caller
// is not reassigned. For every entry in fields:
//   - if an entry with the same Name exists in origFields, the copied entry is
//     updated through FieldDefinition.Update (only the first match is used);
//   - otherwise the entry is appended to the result.
//
// Names are only matched against origFields, not against entries appended
// during this call, so duplicated new names in fields are appended as-is.
func updateFields(origFields, fields []FieldDefinition) []FieldDefinition {
	// Reserve room for the worst case where every incoming field is new.
	updatedFields := make([]FieldDefinition, len(origFields), len(origFields)+len(fields))
	copy(updatedFields, origFields)

	for _, newField := range fields {
		found := false
		for i, origField := range origFields {
			if origField.Name != newField.Name {
				continue
			}

			// When a subfield with the same name exists, update it.
			found = true
			updatedFields[i].Update(newField)
			break
		}
		if !found {
			// When it does not exist, append it.
			updatedFields = append(updatedFields, newField)
		}
	}
	return updatedFields
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
name: event.created
- external: ecs
name: event.kind
- external: ecs
name: event.original
- external: ecs
name: event.outcome
- external: ecs
Expand All @@ -30,6 +32,8 @@
name: log.level
- external: ecs
name: message
- external: ecs
name: process.command_line
- external: ecs
name: process.pid
- external: ecs
Expand Down
19 changes: 19 additions & 0 deletions test/packages/parallel/apache/docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,10 @@ Access logs collects the Apache access logs.
| event.dataset | Event dataset | constant_keyword |
| event.kind | This is one of four ECS Categorization Fields, and indicates the highest level in the ECS category hierarchy. `event.kind` gives high-level information about what type of information the event contains, without being specific to the contents of the event. For example, values of this field distinguish alert events from metric events. The value of this field can be used to inform how these kinds of events should be handled. They may warrant different retention, different access control, it may also help understand whether the data coming in at a regular interval or not. | keyword |
| event.module | Event module | constant_keyword |
| event.original | Raw text message of entire event. Used to demonstrate log integrity or where the full log message (before splitting it up in multiple parts) may be required, e.g. for reindex. This field is not indexed and doc_values are disabled. It cannot be searched, but it can be retrieved from `_source`. If users wish to override this and index this field, please see `Field data types` in the `Elasticsearch Reference`. | keyword |
| event.outcome | This is one of four ECS Categorization Fields, and indicates the lowest level in the ECS category hierarchy. `event.outcome` simply denotes whether the event represents a success or a failure from the perspective of the entity that produced the event. Note that when a single transaction is described in multiple events, each event may populate different values of `event.outcome`, according to their perspective. Also note that in the case of a compound event (a single event that contains multiple logical events), this field should be populated with the value that best captures the overall success or failure from the perspective of the event producer. Further note that not all events will have an associated outcome. For example, this field is generally not populated for metric events, events with `event.type:info`, or any events for which an outcome does not make logical sense. | keyword |
| file.path | Full path to the file, including the file name. It should include the drive letter, when appropriate. | keyword |
| file.path.text | Multi-field of `file.path`. | match_only_text |
| host.architecture | Operating system architecture. | keyword |
| host.containerized | If the host is a container. | boolean |
| host.domain | Name of the domain of which the host is a member. For example, on Windows this could be the host's Active Directory domain or NetBIOS domain name. For Linux this could be the domain of the host's LDAP provider. | keyword |
Expand All @@ -60,6 +62,7 @@ Access logs collects the Apache access logs.
| host.os.family | OS family (such as redhat, debian, freebsd, windows). | keyword |
| host.os.kernel | Operating system kernel version as a raw string. | keyword |
| host.os.name | Operating system name, without the version. | keyword |
| host.os.name.text | Multi-field of `host.os.name`. | text |
| host.os.platform | Operating system platform (such centos, ubuntu, windows). | keyword |
| host.os.version | Operating system version as a raw string. | keyword |
| host.type | Type of host. For Cloud providers this can be the machine type like `t2.medium`. If vm, this could be the container, for example, or other information meaningful in your environment. | keyword |
Expand All @@ -73,11 +76,14 @@ Access logs collects the Apache access logs.
| log.level | Original log level of the log event. If the source of the event provides a log level or textual severity, this is the one that goes in `log.level`. If your source doesn't specify one, you may put your event transport's severity here (e.g. Syslog severity). Some examples are `warn`, `err`, `i`, `informational`. | keyword |
| log.offset | Log offset | long |
| message | For log events the message field contains the log message, optimized for viewing in a log viewer. For structured logs without an original message field, other fields can be concatenated to form a human-readable summary of the event. If multiple messages exist, they can be combined into one message. | match_only_text |
| process.command_line | Full command line that started the process, including the absolute path to the executable, and all arguments. Some arguments may be filtered to protect sensitive information. | wildcard |
| process.command_line.text | Multi-field of `process.command_line`. | match_only_text |
| process.pid | Process id. | long |
| process.thread.id | Thread ID. | long |
| source.address | Some event source addresses are defined ambiguously. The event will sometimes list an IP, a domain or a unix socket. You should always store the raw address in the `.address` field. Then it should be duplicated to `.ip` or `.domain`, depending on which one it is. | keyword |
| source.as.number | Unique number allocated to the autonomous system. The autonomous system number (ASN) uniquely identifies each network on the Internet. | long |
| source.as.organization.name | Organization name. | keyword |
| source.as.organization.name.text | Multi-field of `source.as.organization.name`. | match_only_text |
| source.domain | Source domain. | keyword |
| source.geo.city_name | City name. | keyword |
| source.geo.continent_name | Name of the continent. | keyword |
Expand All @@ -94,14 +100,19 @@ Access logs collects the Apache access logs.
| url.domain | Domain of the url, such as "www.elastic.co". In some cases a URL may refer to an IP and/or port directly, without a domain name. In this case, the IP address would go to the `domain` field. If the URL contains a literal IPv6 address enclosed by `[` and `]` (IETF RFC 2732), the `[` and `]` characters should also be captured in the `domain` field. | keyword |
| url.extension | The field contains the file extension from the original request url, excluding the leading dot. The file extension is only set if it exists, as not every url has a file extension. The leading period must not be included. For example, the value must be "png", not ".png". Note that when the file name has multiple extensions (example.tar.gz), only the last one should be captured ("gz", not "tar.gz"). | keyword |
| url.original | Unmodified original url as seen in the event source. Note that in network monitoring, the observed URL may be a full URL, whereas in access logs, the URL is often just represented as a path. This field is meant to represent the URL as it was observed, complete or not. | wildcard |
| url.original.text | Multi-field of `url.original`. | match_only_text |
| url.path | Path of the request, such as "/search". | wildcard |
| url.query | The query field describes the query string of the request, such as "q=elasticsearch". The `?` is excluded from the query string. If a URL contains no `?`, there is no query field. If there is a `?` but no query, the query field exists with an empty string. The `exists` query can be used to differentiate between the two cases. | keyword |
| user.name | Short name or login of the user. | keyword |
| user.name.text | Multi-field of `user.name`. | match_only_text |
| user_agent.device.name | Name of the device. | keyword |
| user_agent.name | Name of the user agent. | keyword |
| user_agent.original | Unparsed user_agent string. | keyword |
| user_agent.original.text | Multi-field of `user_agent.original`. | match_only_text |
| user_agent.os.full | Operating system name, including the version or code name. | keyword |
| user_agent.os.full.text | Multi-field of `user_agent.os.full`. | match_only_text |
| user_agent.os.name | Operating system name, without the version. | keyword |
| user_agent.os.name.text | Multi-field of `user_agent.os.name`. | match_only_text |
| user_agent.os.version | Operating system version as a raw string. | keyword |
| user_agent.version | Version of the user agent. | keyword |

Expand Down Expand Up @@ -141,6 +152,7 @@ Error logs collects the Apache error logs.
| event.timezone | This field should be populated when the event's timestamp does not include timezone information already (e.g. default Syslog timestamps). It's optional otherwise. Acceptable timezone formats are: a canonical ID (e.g. "Europe/Amsterdam"), abbreviated (e.g. "EST") or an HH:mm differential (e.g. "-05:00"). | keyword |
| event.type | This is one of four ECS Categorization Fields, and indicates the third level in the ECS category hierarchy. `event.type` represents a categorization "sub-bucket" that, when used along with the `event.category` field values, enables filtering events down to a level appropriate for single visualization. This field is an array. This will allow proper categorization of some events that fall in multiple event types. | keyword |
| file.path | Full path to the file, including the file name. It should include the drive letter, when appropriate. | keyword |
| file.path.text | Multi-field of `file.path`. | match_only_text |
| host.architecture | Operating system architecture. | keyword |
| host.containerized | If the host is a container. | boolean |
| host.domain | Name of the domain of which the host is a member. For example, on Windows this could be the host's Active Directory domain or NetBIOS domain name. For Linux this could be the domain of the host's LDAP provider. | keyword |
Expand All @@ -154,6 +166,7 @@ Error logs collects the Apache error logs.
| host.os.family | OS family (such as redhat, debian, freebsd, windows). | keyword |
| host.os.kernel | Operating system kernel version as a raw string. | keyword |
| host.os.name | Operating system name, without the version. | keyword |
| host.os.name.text | Multi-field of `host.os.name`. | text |
| host.os.platform | Operating system platform (such centos, ubuntu, windows). | keyword |
| host.os.version | Operating system version as a raw string. | keyword |
| host.type | Type of host. For Cloud providers this can be the machine type like `t2.medium`. If vm, this could be the container, for example, or other information meaningful in your environment. | keyword |
Expand All @@ -172,6 +185,7 @@ Error logs collects the Apache error logs.
| source.address | Some event source addresses are defined ambiguously. The event will sometimes list an IP, a domain or a unix socket. You should always store the raw address in the `.address` field. Then it should be duplicated to `.ip` or `.domain`, depending on which one it is. | keyword |
| source.as.number | Unique number allocated to the autonomous system. The autonomous system number (ASN) uniquely identifies each network on the Internet. | long |
| source.as.organization.name | Organization name. | keyword |
| source.as.organization.name.text | Multi-field of `source.as.organization.name`. | match_only_text |
| source.geo.city_name | City name. | keyword |
| source.geo.continent_name | Name of the continent. | keyword |
| source.geo.country_iso_code | Country ISO code. | keyword |
Expand All @@ -185,13 +199,17 @@ Error logs collects the Apache error logs.
| url.domain | Domain of the url, such as "www.elastic.co". In some cases a URL may refer to an IP and/or port directly, without a domain name. In this case, the IP address would go to the `domain` field. If the URL contains a literal IPv6 address enclosed by `[` and `]` (IETF RFC 2732), the `[` and `]` characters should also be captured in the `domain` field. | keyword |
| url.extension | The field contains the file extension from the original request url, excluding the leading dot. The file extension is only set if it exists, as not every url has a file extension. The leading period must not be included. For example, the value must be "png", not ".png". Note that when the file name has multiple extensions (example.tar.gz), only the last one should be captured ("gz", not "tar.gz"). | keyword |
| url.original | Unmodified original url as seen in the event source. Note that in network monitoring, the observed URL may be a full URL, whereas in access logs, the URL is often just represented as a path. This field is meant to represent the URL as it was observed, complete or not. | wildcard |
| url.original.text | Multi-field of `url.original`. | match_only_text |
| url.path | Path of the request, such as "/search". | wildcard |
| url.query | The query field describes the query string of the request, such as "q=elasticsearch". The `?` is excluded from the query string. If a URL contains no `?`, there is no query field. If there is a `?` but no query, the query field exists with an empty string. The `exists` query can be used to differentiate between the two cases. | keyword |
| user.name | Short name or login of the user. | keyword |
| user.name.text | Multi-field of `user.name`. | match_only_text |
| user_agent.device.name | Name of the device. | keyword |
| user_agent.name | Name of the user agent. | keyword |
| user_agent.original | Unparsed user_agent string. | keyword |
| user_agent.original.text | Multi-field of `user_agent.original`. | match_only_text |
| user_agent.os.name | Operating system name, without the version. | keyword |
| user_agent.os.name.text | Multi-field of `user_agent.os.name`. | match_only_text |


## Metrics
Expand Down Expand Up @@ -378,6 +396,7 @@ An example event for `status` looks as following:
| host.os.family | OS family (such as redhat, debian, freebsd, windows). | keyword | | |
| host.os.kernel | Operating system kernel version as a raw string. | keyword | | |
| host.os.name | Operating system name, without the version. | keyword | | |
| host.os.name.text | Multi-field of `host.os.name`. | text | | |
| host.os.platform | Operating system platform (such centos, ubuntu, windows). | keyword | | |
| host.os.version | Operating system version as a raw string. | keyword | | |
| host.type | Type of host. For Cloud providers this can be the machine type like `t2.medium`. If vm, this could be the container, for example, or other information meaningful in your environment. | keyword | | |
Expand Down
Loading

0 comments on commit 4845791

Please sign in to comment.