Skip to content

Commit

Permalink
Rewrite exact match regexes to use tsdb index
Browse files Browse the repository at this point in the history
This commit adds support for replacing regexes with non-regex conditions
when possible. Currently the following regexes are supported:

 - host =~ /^foo$/ will be converted into host = 'foo'
 - host !~ /^foo$/ will be converted into host != 'foo'

Note: if the regex expression contains character classes, grouping,
repetition or similar, it may not be rewritten.

For example, the condition: name =~ /^foo|bar$/ will not be rewritten.
Support for this may arrive in the future.

Regexes that can be converted into simpler expressions will be able to
take advantage of the tsdb index, making them significantly faster.
  • Loading branch information
e-dard committed Oct 25, 2016
1 parent 74c6a0c commit 06d1226
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ The following configuration changes in the `[data]` section may need to changed
- [#7496](https://github.com/influxdata/influxdb/pull/7496): Filter out series within shards that do not have data for that series.
- [#7480](https://github.com/influxdata/influxdb/pull/7480): Improve compaction planning performance by caching tsm file stats.
- [#7320](https://github.com/influxdata/influxdb/issues/7320): Update defaults in config for latest best practices
- [#7495](https://github.com/influxdata/influxdb/pull/7495): Rewrite regexes of the form host =~ /^server-a$/ to host = 'server-a', to take advantage of the tsdb index.

### Bugfixes

Expand Down
3 changes: 3 additions & 0 deletions coordinator/statement_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,9 @@ func (e *StatementExecutor) createIterators(stmt *influxql.SelectStatement, ctx
// Remove "time" from fields list.
stmt.RewriteTimeFields()

// Rewrite any regex conditions that could make use of the index.
stmt.RewriteRegexConditions()

// Create an iterator creator based on the shards in the cluster.
ic, err := e.iteratorCreator(stmt, &opt)
if err != nil {
Expand Down
88 changes: 88 additions & 0 deletions influxql/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"regexp"
"regexp/syntax"
"sort"
"strconv"
"strings"
Expand Down Expand Up @@ -1235,6 +1236,93 @@ func (s *SelectStatement) RewriteFields(ic IteratorCreator) (*SelectStatement, e
return other, nil
}

// RewriteRegexConditions rewrites regex conditions to make better use of the
// database index.
//
// Conditions that can currently be simplified are:
//
//   - host =~ /^foo$/ becomes host = 'foo'
//   - host !~ /^foo$/ becomes host != 'foo'
//
// Note: if the regex contains groups, character classes, repetition or
// similar, it's likely it won't be rewritten. Supporting regexes with these
// constructs would be a lot more work.
//
// Expressions that qualify are updated in place: the regex operator is swapped
// for its plain comparison counterpart and the right-hand side is replaced by
// a string literal. All other expressions are returned untouched.
func (s *SelectStatement) RewriteRegexConditions() {
	s.Condition = RewriteExpr(s.Condition, func(e Expr) Expr {
		be, ok := e.(*BinaryExpr)
		if !ok || (be.Op != EQREGEX && be.Op != NEQREGEX) {
			// This expression is not a binary condition or doesn't have a
			// regex based operator.
			return e
		}

		// A regex operator is expected to carry a regex on its right-hand
		// side. Use a checked assertion so that an unexpected operand leaves
		// the expression alone instead of panicking.
		rhs, ok := be.RHS.(*RegexLiteral)
		if !ok {
			return e
		}

		val, ok := matchExactRegex(rhs.Val.String())
		if !ok {
			// The regex is not a plain anchored literal; keep it as is.
			return e
		}

		// Swap the regex for the literal text found between its anchors.
		be.RHS = &StringLiteral{Val: val}

		// Update the condition operator to the matching non-regex comparison.
		if be.Op == EQREGEX {
			be.Op = EQ
		} else {
			be.Op = NEQ
		}
		return be
	})
}

// matchExactRegex reports whether the regex source v is equivalent to an exact
// string comparison, that is, whether it has the form /^foo$/. When it does,
// the literal text between the anchors is returned together with true. The
// pattern /^$/ is considered a match on the empty string.
//
// The second return value is false, and the string is empty, when:
//
//   - v cannot be parsed as a Perl-syntax regex;
//   - the pattern is not anchored at both the start and the end of the text;
//   - anything other than a single literal sits between the anchors;
//   - the literal is case-insensitive (for example /(?i)^foo$/), because an
//     equality test cannot express case folding;
//   - the anchors are multi-line anchors (for example /(?m)^foo$/), because
//     those also match inside values that contain newlines.
func matchExactRegex(v string) (string, bool) {
	re, err := syntax.Parse(v, syntax.Perl)
	if err != nil {
		// Nothing we can do or log.
		return "", false
	}

	if re.Op != syntax.OpConcat {
		return "", false
	}

	if len(re.Sub) < 2 || len(re.Sub) > 3 {
		// Regex has too few or too many subexpressions.
		return "", false
	}

	// Only whole-text anchors are equivalent to equality. Line anchors
	// (OpBeginLine/OpEndLine) are produced by the (?m) flag and would also
	// match a value such as "foo\nbar".
	if re.Sub[0].Op != syntax.OpBeginText {
		// Regex does not begin with ^
		return "", false
	}

	if re.Sub[len(re.Sub)-1].Op != syntax.OpEndText {
		// Regex does not end with $
		return "", false
	}

	if len(re.Sub) == 2 {
		// The regex /^$/
		return "", true
	}

	middle := re.Sub[1]
	if middle.Op != syntax.OpLiteral {
		// Regex does not contain a literal op.
		return "", false
	}

	if middle.Flags&syntax.FoldCase != 0 {
		// A case-insensitive literal matches more than one string, and its
		// runes are stored in folded form, so it cannot be rewritten.
		return "", false
	}

	// We can rewrite this regex.
	return string(middle.Rune), true
}

// RewriteDistinct rewrites the expression to be a call for map/reduce to work correctly
// This method assumes all validation has passed
func (s *SelectStatement) RewriteDistinct() {
Expand Down
69 changes: 69 additions & 0 deletions influxql/ast_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,75 @@ func TestSelectStatement_RewriteFields(t *testing.T) {
}
}

// Test SELECT statement regex conditions rewrite.
//
// Each case parses the input statement, applies RewriteRegexConditions to it,
// and compares the resulting AST with the AST obtained by parsing the expected
// output statement. Cases whose input and output are identical assert that the
// condition is left untouched.
func TestSelectStatement_RewriteRegexConditions(t *testing.T) {
	var tests = []struct {
		in  string
		out string
	}{
		{in: `SELECT value FROM cpu`, out: `SELECT value FROM cpu`},
		{in: `SELECT value FROM cpu WHERE host='server-1'`, out: `SELECT value FROM cpu WHERE host='server-1'`},
		{in: `SELECT value FROM cpu WHERE host = 'server-1'`, out: `SELECT value FROM cpu WHERE host = 'server-1'`},
		{in: `SELECT value FROM cpu WHERE host != 'server-1'`, out: `SELECT value FROM cpu WHERE host != 'server-1'`},

		// Non matching regex
		{in: `SELECT value FROM cpu WHERE host =~ /server-1|server-2|server-3/`, out: `SELECT value FROM cpu WHERE host =~ /server-1|server-2|server-3/`},
		{in: `SELECT value FROM cpu WHERE host =~ /server-1/`, out: `SELECT value FROM cpu WHERE host =~ /server-1/`},
		{in: `SELECT value FROM cpu WHERE host !~ /server-1/`, out: `SELECT value FROM cpu WHERE host !~ /server-1/`},
		{in: `SELECT value FROM cpu WHERE host =~ /^server-1/`, out: `SELECT value FROM cpu WHERE host =~ /^server-1/`},
		{in: `SELECT value FROM cpu WHERE host =~ /server-1$/`, out: `SELECT value FROM cpu WHERE host =~ /server-1$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /\^server-1$/`, out: `SELECT value FROM cpu WHERE host !~ /\^server-1$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /\^$/`, out: `SELECT value FROM cpu WHERE host !~ /\^$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /^server-1\$/`, out: `SELECT value FROM cpu WHERE host !~ /^server-1\$/`},
		{in: `SELECT value FROM cpu WHERE host =~ /^\$/`, out: `SELECT value FROM cpu WHERE host =~ /^\$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /^a/`, out: `SELECT value FROM cpu WHERE host !~ /^a/`},

		// These regexes are not supported due to the presence of escaped or meta characters.
		{in: `SELECT value FROM cpu WHERE host !~ /^(foo|bar)$/`, out: `SELECT value FROM cpu WHERE host !~ /^(foo|bar)$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /^?a$/`, out: `SELECT value FROM cpu WHERE host !~ /^?a$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /^[a-z]$/`, out: `SELECT value FROM cpu WHERE host !~ /^[a-z]$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /^\d$/`, out: `SELECT value FROM cpu WHERE host !~ /^\d$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /^a*$/`, out: `SELECT value FROM cpu WHERE host !~ /^a*$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /^a.b$/`, out: `SELECT value FROM cpu WHERE host !~ /^a.b$/`},
		{in: `SELECT value FROM cpu WHERE host !~ /^ab+$/`, out: `SELECT value FROM cpu WHERE host !~ /^ab+$/`},
		{in: `SELECT value FROM cpu WHERE host =~ /^hello\world$/`, out: `SELECT value FROM cpu WHERE host =~ /^hello\world$/`},

		// These regexes all match and will be rewritten.
		{in: `SELECT value FROM cpu WHERE host !~ /^a[2]$/`, out: `SELECT value FROM cpu WHERE host != 'a2'`},
		{in: `SELECT value FROM cpu WHERE host =~ /^server-1$/`, out: `SELECT value FROM cpu WHERE host = 'server-1'`},
		{in: `SELECT value FROM cpu WHERE host !~ /^server-1$/`, out: `SELECT value FROM cpu WHERE host != 'server-1'`},
		{in: `SELECT value FROM cpu WHERE host =~ /^server 1$/`, out: `SELECT value FROM cpu WHERE host = 'server 1'`},
		{in: `SELECT value FROM cpu WHERE host =~ /^$/`, out: `SELECT value FROM cpu WHERE host = ''`},
		{in: `SELECT value FROM cpu WHERE host !~ /^$/`, out: `SELECT value FROM cpu WHERE host != ''`},
		{in: `SELECT value FROM cpu WHERE host =~ /^server-1$/ OR host =~ /^server-2$/`, out: `SELECT value FROM cpu WHERE host = 'server-1' OR host = 'server-2'`},
		{in: `SELECT value FROM cpu WHERE host =~ /^server-1$/ OR host =~ /^server]a$/`, out: `SELECT value FROM cpu WHERE host = 'server-1' OR host = 'server]a'`},
		{in: `SELECT value FROM cpu WHERE host =~ /^hello\?$/`, out: `SELECT value FROM cpu WHERE host = 'hello?'`},
		{in: `SELECT value FROM cpu WHERE host !~ /^\\$/`, out: `SELECT value FROM cpu WHERE host != '\\'`},
		{in: `SELECT value FROM cpu WHERE host !~ /^\\\$$/`, out: `SELECT value FROM cpu WHERE host != '\\$'`},
	}

	for i, test := range tests {
		// Number examples from one in every message so that a failure always
		// points at the same table row regardless of which check reported it.
		n := i + 1

		stmt, err := influxql.NewParser(strings.NewReader(test.in)).ParseStatement()
		if err != nil {
			t.Fatalf("[Example %d], %v", n, err)
		}

		sel, ok := stmt.(*influxql.SelectStatement)
		if !ok {
			t.Fatalf("[Example %d], expected a SELECT statement, got %T", n, stmt)
		}

		// Rewrite any supported regex conditions.
		sel.RewriteRegexConditions()

		// Get the expected rewritten statement.
		expStmt, err := influxql.NewParser(strings.NewReader(test.out)).ParseStatement()
		if err != nil {
			t.Fatalf("[Example %d], %v", n, err)
		}

		// Compare the (potentially) rewritten AST to the expected AST.
		if got, exp := stmt, expStmt; !reflect.DeepEqual(got, exp) {
			t.Errorf("[Example %d]\nattempting %v\ngot %v\n%s\n\nexpected %v\n%s\n", n, test.in, got, mustMarshalJSON(got), exp, mustMarshalJSON(exp))
		}
	}
}

// Test SELECT statement time field rewrite.
func TestSelectStatement_RewriteTimeFields(t *testing.T) {
var tests = []struct {
Expand Down
2 changes: 1 addition & 1 deletion influxql/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2986,7 +2986,7 @@ func newAlterRetentionPolicyStatement(name string, DB string, d, sd time.Duratio

// mustMarshalJSON encodes a value to JSON.
func mustMarshalJSON(v interface{}) []byte {
b, err := json.Marshal(v)
b, err := json.MarshalIndent(v, "", " ")
if err != nil {
panic(err)
}
Expand Down

0 comments on commit 06d1226

Please sign in to comment.