From 06d1226b9a369316a06232c0c0e99a02df64a2e0 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 20 Oct 2016 17:18:04 +0100 Subject: [PATCH] Rewrite exact match regexes to use tsdb index This commit adds support for replacing regexes with non-regex conditions when possible. Currently the following regexes are supported: - host =~ /^foo$/ will be converted into host = 'foo' - host !~ /^foo$/ will be converted into host != 'foo' Note: if the regex expression contains character classes, grouping, repetition or similar, it may not be rewritten. For example, the condition: name =~ /^foo|bar$/ will not be rewritten. Support for this may arrive in the future. Regexes that can be converted into simpler expressions will be able to take advantage of the tsdb index, making them significantly faster. --- CHANGELOG.md | 1 + coordinator/statement_executor.go | 3 ++ influxql/ast.go | 88 +++++++++++++++++++++++++++++++ influxql/ast_test.go | 69 ++++++++++++++++++++++++ influxql/parser_test.go | 2 +- 5 files changed, 162 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d0999b73ff..4c0dac3b1ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ The following configuration changes in the `[data]` section may need to changed - [#7496](https://github.com/influxdata/influxdb/pull/7496): Filter out series within shards that do not have data for that series. - [#7480](https://github.com/influxdata/influxdb/pull/7480): Improve compaction planning performance by caching tsm file stats. - [#7320](https://github.com/influxdata/influxdb/issues/7320): Update defaults in config for latest best practices +- [#7495](https://github.com/influxdata/influxdb/pull/7495): Rewrite regexes of the form host =~ /^server-a$/ to host = 'server-a', to take advantage of the tsdb index. 
### Bugfixes diff --git a/coordinator/statement_executor.go b/coordinator/statement_executor.go index 2d0c3799b55..b84da9f7c52 100644 --- a/coordinator/statement_executor.go +++ b/coordinator/statement_executor.go @@ -542,6 +542,9 @@ func (e *StatementExecutor) createIterators(stmt *influxql.SelectStatement, ctx // Remove "time" from fields list. stmt.RewriteTimeFields() + // Rewrite any regex conditions that could make use of the index. + stmt.RewriteRegexConditions() + // Create an iterator creator based on the shards in the cluster. ic, err := e.iteratorCreator(stmt, &opt) if err != nil { diff --git a/influxql/ast.go b/influxql/ast.go index a422408e958..644a5b0bf66 100644 --- a/influxql/ast.go +++ b/influxql/ast.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "regexp" + "regexp/syntax" "sort" "strconv" "strings" @@ -1235,6 +1236,93 @@ func (s *SelectStatement) RewriteFields(ic IteratorCreator) (*SelectStatement, e return other, nil } +// RewriteRegexConditions rewrites regex conditions to make better use of the +// database index. +// +// Conditions that can currently be simplified are: +// +// - host =~ /^foo$/ becomes host = 'foo' +// - host !~ /^foo$/ becomes host != 'foo' +// +// Note: if the regex contains groups, character classes, repetition or +// similar, it's likely it won't be rewritten. Supporting the rewriting of +// regexes with these constructs would be a lot more work. +func (s *SelectStatement) RewriteRegexConditions() { + s.Condition = RewriteExpr(s.Condition, func(e Expr) Expr { + be, ok := e.(*BinaryExpr) + if !ok || (be.Op != EQREGEX && be.Op != NEQREGEX) { + // This expression is not a binary condition or doesn't have a + // regex based operator. + return e + } + + // Handle regex-based condition. + rhs := be.RHS.(*RegexLiteral) // This must be a regex. + + val, ok := matchExactRegex(rhs.Val.String()) + if !ok { + // Not an exact-match regex; leave the condition untouched. + return e + } + + // Replace the regex with the literal string it matches exactly. 
+ be.RHS = &StringLiteral{Val: val} + + // Update the condition operator. + if be.Op == EQREGEX { + be.Op = EQ + } else { + be.Op = NEQ + } + return be + }) +} + +// matchExactRegex matches case-sensitive, single-line regexes of the form +// /^foo$/ and returns the literal; /^$/ matches and yields the empty string. +func matchExactRegex(v string) (string, bool) { + re, err := syntax.Parse(v, syntax.Perl) + if err != nil { + // Nothing we can do or log. + return "", false + } + + if re.Op != syntax.OpConcat { + return "", false + } + + if len(re.Sub) < 2 || len(re.Sub) > 3 { + // Regex has too few or too many subexpressions. + return "", false + } + + start := re.Sub[0] + if start.Op != syntax.OpBeginText { + // Regex does not begin with ^, or ^ is a (?m) line anchor. + return "", false + } + + end := re.Sub[len(re.Sub)-1] + if end.Op != syntax.OpEndText { + // Regex does not end with $, or $ is a (?m) line anchor. + return "", false + } + + if len(re.Sub) == 3 { + middle := re.Sub[1] + if middle.Op != syntax.OpLiteral || middle.Flags&syntax.FoldCase != 0 { + // Regex body is not a case-sensitive literal, e.g. (?i). + return "", false + } + + // We can rewrite this regex. + return string(middle.Rune), true + } + + // The regex /^$/ + return "", true +} + // RewriteDistinct rewrites the expression to be a call for map/reduce to work correctly // This method assumes all validation has passed func (s *SelectStatement) RewriteDistinct() { diff --git a/influxql/ast_test.go b/influxql/ast_test.go index 672771962e8..f1d3eab582e 100644 --- a/influxql/ast_test.go +++ b/influxql/ast_test.go @@ -452,6 +452,75 @@ func TestSelectStatement_RewriteFields(t *testing.T) { } } +// Test SELECT statement regex conditions rewrite. 
+func TestSelectStatement_RewriteRegexConditions(t *testing.T) { + var tests = []struct { + in string + out string + }{ + {in: `SELECT value FROM cpu`, out: `SELECT value FROM cpu`}, + {in: `SELECT value FROM cpu WHERE host='server-1'`, out: `SELECT value FROM cpu WHERE host='server-1'`}, + {in: `SELECT value FROM cpu WHERE host = 'server-1'`, out: `SELECT value FROM cpu WHERE host = 'server-1'`}, + {in: `SELECT value FROM cpu WHERE host != 'server-1'`, out: `SELECT value FROM cpu WHERE host != 'server-1'`}, + + // Non matching regex + {in: `SELECT value FROM cpu WHERE host =~ /server-1|server-2|server-3/`, out: `SELECT value FROM cpu WHERE host =~ /server-1|server-2|server-3/`}, + {in: `SELECT value FROM cpu WHERE host =~ /server-1/`, out: `SELECT value FROM cpu WHERE host =~ /server-1/`}, + {in: `SELECT value FROM cpu WHERE host !~ /server-1/`, out: `SELECT value FROM cpu WHERE host !~ /server-1/`}, + {in: `SELECT value FROM cpu WHERE host =~ /^server-1/`, out: `SELECT value FROM cpu WHERE host =~ /^server-1/`}, + {in: `SELECT value FROM cpu WHERE host =~ /server-1$/`, out: `SELECT value FROM cpu WHERE host =~ /server-1$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /\^server-1$/`, out: `SELECT value FROM cpu WHERE host !~ /\^server-1$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /\^$/`, out: `SELECT value FROM cpu WHERE host !~ /\^$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /^server-1\$/`, out: `SELECT value FROM cpu WHERE host !~ /^server-1\$/`}, + {in: `SELECT value FROM cpu WHERE host =~ /^\$/`, out: `SELECT value FROM cpu WHERE host =~ /^\$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /^a/`, out: `SELECT value FROM cpu WHERE host !~ /^a/`}, + + // These regexes are not supported due to the presence of escaped or meta characters. 
+ {in: `SELECT value FROM cpu WHERE host !~ /^(foo|bar)$/`, out: `SELECT value FROM cpu WHERE host !~ /^(foo|bar)$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /^?a$/`, out: `SELECT value FROM cpu WHERE host !~ /^?a$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /^[a-z]$/`, out: `SELECT value FROM cpu WHERE host !~ /^[a-z]$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /^\d$/`, out: `SELECT value FROM cpu WHERE host !~ /^\d$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /^a*$/`, out: `SELECT value FROM cpu WHERE host !~ /^a*$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /^a.b$/`, out: `SELECT value FROM cpu WHERE host !~ /^a.b$/`}, + {in: `SELECT value FROM cpu WHERE host !~ /^ab+$/`, out: `SELECT value FROM cpu WHERE host !~ /^ab+$/`}, + {in: `SELECT value FROM cpu WHERE host =~ /^hello\world$/`, out: `SELECT value FROM cpu WHERE host =~ /^hello\world$/`}, + + // These regexes all match and will be rewritten. + {in: `SELECT value FROM cpu WHERE host !~ /^a[2]$/`, out: `SELECT value FROM cpu WHERE host != 'a2'`}, + {in: `SELECT value FROM cpu WHERE host =~ /^server-1$/`, out: `SELECT value FROM cpu WHERE host = 'server-1'`}, + {in: `SELECT value FROM cpu WHERE host !~ /^server-1$/`, out: `SELECT value FROM cpu WHERE host != 'server-1'`}, + {in: `SELECT value FROM cpu WHERE host =~ /^server 1$/`, out: `SELECT value FROM cpu WHERE host = 'server 1'`}, + {in: `SELECT value FROM cpu WHERE host =~ /^$/`, out: `SELECT value FROM cpu WHERE host = ''`}, + {in: `SELECT value FROM cpu WHERE host !~ /^$/`, out: `SELECT value FROM cpu WHERE host != ''`}, + {in: `SELECT value FROM cpu WHERE host =~ /^server-1$/ OR host =~ /^server-2$/`, out: `SELECT value FROM cpu WHERE host = 'server-1' OR host = 'server-2'`}, + {in: `SELECT value FROM cpu WHERE host =~ /^server-1$/ OR host =~ /^server]a$/`, out: `SELECT value FROM cpu WHERE host = 'server-1' OR host = 'server]a'`}, + {in: `SELECT value FROM cpu WHERE host =~ /^hello\?$/`, out: `SELECT value FROM cpu WHERE host = 
'hello?'`}, + {in: `SELECT value FROM cpu WHERE host !~ /^\\$/`, out: `SELECT value FROM cpu WHERE host != '\\'`}, + {in: `SELECT value FROM cpu WHERE host !~ /^\\\$$/`, out: `SELECT value FROM cpu WHERE host != '\\$'`}, + } + + for i, test := range tests { + stmt, err := influxql.NewParser(strings.NewReader(test.in)).ParseStatement() + if err != nil { + t.Fatalf("[Example %d], %v", i+1, err) + } + + // Rewrite any supported regex conditions. + stmt.(*influxql.SelectStatement).RewriteRegexConditions() + + // Get the expected rewritten statement. + expStmt, err := influxql.NewParser(strings.NewReader(test.out)).ParseStatement() + if err != nil { + t.Fatalf("[Example %d], %v", i+1, err) + } + + // Compare the (potentially) rewritten AST to the expected AST. + if got, exp := stmt, expStmt; !reflect.DeepEqual(got, exp) { + t.Errorf("[Example %d]\nattempting %v\ngot %v\n%s\n\nexpected %v\n%s\n", i+1, test.in, got, mustMarshalJSON(got), exp, mustMarshalJSON(exp)) + } + } +} + // Test SELECT statement time field rewrite. func TestSelectStatement_RewriteTimeFields(t *testing.T) { var tests = []struct { diff --git a/influxql/parser_test.go b/influxql/parser_test.go index 63fe4f74bec..404af148c37 100644 --- a/influxql/parser_test.go +++ b/influxql/parser_test.go @@ -2986,7 +2986,7 @@ func newAlterRetentionPolicyStatement(name string, DB string, d, sd time.Duratio // mustMarshalJSON encodes a value to JSON. func mustMarshalJSON(v interface{}) []byte { - b, err := json.Marshal(v) + b, err := json.MarshalIndent(v, "", " ") if err != nil { panic(err) }