From b5c54c2a66999c16319bbf75b7f8a19ae98a586c Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Sun, 30 Apr 2023 02:10:58 +0530
Subject: [PATCH 1/9] Enable support for not regex operation

---
 pkg/traceql/ast_validate.go    |   3 +-
 pkg/traceql/test_examples.yaml | 385 ++++++++++++++++-----------------
 2 files changed, 193 insertions(+), 195 deletions(-)

diff --git a/pkg/traceql/ast_validate.go b/pkg/traceql/ast_validate.go
index cf5d9d3cc47..30dff1c4880 100644
--- a/pkg/traceql/ast_validate.go
+++ b/pkg/traceql/ast_validate.go
@@ -178,8 +178,7 @@ func (o BinaryOperation) validate() error {
 	}
 
 	switch o.Op {
-	case OpNotRegex,
-		OpSpansetChild,
+	case OpSpansetChild,
 		OpSpansetDescendant,
 		OpSpansetSibling:
 		return newUnsupportedError(fmt.Sprintf("binary operation (%v)", o.Op))
diff --git a/pkg/traceql/test_examples.yaml b/pkg/traceql/test_examples.yaml
index 6385b8d3e93..03022f728d6 100644
--- a/pkg/traceql/test_examples.yaml
+++ b/pkg/traceql/test_examples.yaml
@@ -1,258 +1,257 @@
 # valid queries parse successfully and return nil when calling .validate()
 valid:
   # spanset filters
-  - '{ true }'
-  - '{ !true }'
-  - '{ true && false }'
-  - '{ true || false }'
-  - '{ 1 = 2 }'
-  - '{ 1 != 2 }'
-  - '{ 1 > 2 }'
-  - '{ 1 >= 2 }'
-  - '{ 1 < 2 }'
-  - '{ 1 <= 2 }'
-  - '{ -1 = 2 }'
+  - "{ true }"
+  - "{ !true }"
+  - "{ true && false }"
+  - "{ true || false }"
+  - "{ 1 = 2 }"
+  - "{ 1 != 2 }"
+  - "{ 1 > 2 }"
+  - "{ 1 >= 2 }"
+  - "{ 1 < 2 }"
+  - "{ 1 <= 2 }"
+  - "{ -1 = 2 }"
   - '{ "test" =~ "test" }'
+  - '{ "test" !~ "test" }'
   - '{ "test" = "test" }'
   - '{ "test" != "test" }'
-  - '{ .a }'
-  - '{ !.a }'
-  - '{ .a && false }'
-  - '{ .a || true }'
-  - '{ .a = 2 }'
-  - '{ .a != 2 }'
-  - '{ .a > 2 }'
-  - '{ .a >= 2 }'
-  - '{ .a < 2 }'
-  - '{ .a <= 2 }'
-  - '{ -.a = 2 }'
+  - "{ .a }"
+  - "{ !.a }"
+  - "{ .a && false }"
+  - "{ .a || true }"
+  - "{ .a = 2 }"
+  - "{ .a != 2 }"
+  - "{ .a > 2 }"
+  - "{ .a >= 2 }"
+  - "{ .a < 2 }"
+  - "{ .a <= 2 }"
+  - "{ -.a = 2 }"
   - '{ .a =~ "test" }'
+  - '{ .a !~ "test" }'
   - '{ .a = "test" }'
   - '{ .a != "test" }'
-  - '{ resource.a != 3 }'
-  - '{ span.a != 3 }'
+  - "{ resource.a != 3 }"
+  - "{ span.a != 3 }"
   - '{ !("test" != .c || ((true && .b) || 3 < .a)) }'
-  - '{ status = ok }'
-  - '{ status = unset }'
-  - '{ status = error }'
-  - '{ status != error }'
-  - '{ kind = internal }'
-  - '{ kind = client }'
-  - '{ kind = consumer }'
-  - '{ duration > 1s }'
-  - '{ 1 < 1h }'
-  - '{ 1 <= 1.1 }'
+  - "{ status = ok }"
+  - "{ status = unset }"
+  - "{ status = error }"
+  - "{ status != error }"
+  - "{ kind = internal }"
+  - "{ kind = client }"
+  - "{ kind = consumer }"
+  - "{ duration > 1s }"
+  - "{ 1 < 1h }"
+  - "{ 1 <= 1.1 }"
   # binary operations
-  - '{ 1 + 1 = 2 }'
-  - '{ 1 - 1 = 2 }'
-  - '{ 1 * 1 = 2 }'
-  - '{ 1 / 1 = 2 }'
-  - '{ 1 ^ 1 = 2 }'
-  - '{ .a + 1 = 2 }'
-  - '{ .a - 1 = 2 }'
-  - '{ .a * 1 = 2 }'
-  - '{ .a / 1 = 2 }'
-  - '{ .a ^ 1 = 2 }'
-  - '{ duration > 1s * 2s }' 
-  - '{ 1 * 1h = 1 }'     # combining float, int and duration can make sense, but can also be weird. we just accept it all
-  - '{ 1 / 1.1 = 1 }'
+  - "{ 1 + 1 = 2 }"
+  - "{ 1 - 1 = 2 }"
+  - "{ 1 * 1 = 2 }"
+  - "{ 1 / 1 = 2 }"
+  - "{ 1 ^ 1 = 2 }"
+  - "{ .a + 1 = 2 }"
+  - "{ .a - 1 = 2 }"
+  - "{ .a * 1 = 2 }"
+  - "{ .a / 1 = 2 }"
+  - "{ .a ^ 1 = 2 }"
+  - "{ duration > 1s * 2s }"
+  - "{ 1 * 1h = 1 }" # combining float, int and duration can make sense, but can also be weird. we just accept it all
+  - "{ 1 / 1.1 = 1 }"
   - '{ .http.status >= "200" }'
   # spanset expressions
-  - '{ true } && { true }'
-  - '{ true } || { true }'
+  - "{ true } && { true }"
+  - "{ true } || { true }"
   # scalar filters
-  - 'avg(.field) > 1'
-  - 'max(duration) >= 1s'
-  - 'max(duration) > 1'            # same note as above for int, float and duration
-  - '{ true } | max(duration) = 1h'
-  - '{ true } | min(duration) = 1h'
-  - '{ true } | sum(duration) = 1h'
-  - '{ true } | max(.a) = 1'
-  - '{ true } | max(span.a) = 1'
-  - '{ true } | max(resource.a) = 1'
-  - '{ true } | max(1 + .a) = 1'
-  - '{ true } | max((1 + .a) * 2) = 1'
-  - 'max(duration) > 3s | { status = error || .http.status = 500 }'
+  - "avg(.field) > 1"
+  - "max(duration) >= 1s"
+  - "max(duration) > 1" # same note as above for int, float and duration
+  - "{ true } | max(duration) = 1h"
+  - "{ true } | min(duration) = 1h"
+  - "{ true } | sum(duration) = 1h"
+  - "{ true } | max(.a) = 1"
+  - "{ true } | max(span.a) = 1"
+  - "{ true } | max(resource.a) = 1"
+  - "{ true } | max(1 + .a) = 1"
+  - "{ true } | max((1 + .a) * 2) = 1"
+  - "max(duration) > 3s | { status = error || .http.status = 500 }"
   # pipelines
-  - '{ true } | { .a }'
-  - '{ true } | count() = 1'
-  - '{ true } | avg(duration) = 1h'
-  - 'count() = 1 | { true }'
-  - '{ true } | count() = 1 | { true }'
+  - "{ true } | { .a }"
+  - "{ true } | count() = 1"
+  - "{ true } | avg(duration) = 1h"
+  - "count() = 1 | { true }"
+  - "{ true } | count() = 1 | { true }"
   # pipeline expressions
-  - '({ true } | count() > 1 | { false }) && ({ true } | count() > 1 | { false })'
-  - '({ true } | count() > 1 | { false }) || ({ true } | count() > 1 | { false })'
-  
+  - "({ true } | count() > 1 | { false }) && ({ true } | count() > 1 | { false })"
+  - "({ true } | count() > 1 | { false }) || ({ true } | count() > 1 | { false })"
+
 # parse_fails throw an error when parsing
 parse_fails:
-  - 'true'
-  - '[ true ]'
-  - '( true )'
+  - "true"
+  - "[ true ]"
+  - "( true )"
   # spanset filters
-  - '{ . }'
-  - '{ < }'
-  - '{ .a < }'
-  - '{ .a < 3'
-  - '{ (.a < 3 }'
-  - '{ attribute = 4 }'           # custom attribute not prefixed with ., span., resource. or parent.
-  - '{ .attribute == 4 }'         # invalid operator
-  - '{ span. }'
+  - "{ . }"
+  - "{ < }"
+  - "{ .a < }"
+  - "{ .a < 3"
+  - "{ (.a < 3 }"
+  - "{ attribute = 4 }" # custom attribute not prefixed with ., span., resource. or parent.
+  - "{ .attribute == 4 }" # invalid operator
+  - "{ span. }"
   # spanset expressions
-  - '{ true } + { true }'
-  - '{ true } - { true }'
-  - '{ true } * { true }'
-  - '{ true } / { true }'
-  - '{ true } ^ { true }'
-  - '{ true } = { true }'         # an interesting operator. possible future addition
-  - '{ true } <= { true }'
-  - '{ true } >= { true }'
-  - '{ true } < { true }'
+  - "{ true } + { true }"
+  - "{ true } - { true }"
+  - "{ true } * { true }"
+  - "{ true } / { true }"
+  - "{ true } ^ { true }"
+  - "{ true } = { true }" # an interesting operator. possible future addition
+  - "{ true } <= { true }"
+  - "{ true } >= { true }"
+  - "{ true } < { true }"
   # scalar expressions must evaluate to a number
   - 'max(name) = "foo"'
   - 'avg("foo") = "bar"'
-  - 'max(status) = ok'
-  - 'max(kind) = consumer'
-  - 'max(duration) < ok'
+  - "max(status) = ok"
+  - "max(kind) = consumer"
+  - "max(duration) < ok"
   - 'min(1) = "foo"'
-  - 'min(parent) = nil'
+  - "min(parent) = nil"
   - 'avg(childCount) > "foo"'
   # scalar filters
-  - 'avg(.field) + 1'             # scalar filters must resolve to boolean
-  - 'sum(3) - 2'
-  - 'min(childCount) && 2'
+  - "avg(.field) + 1" # scalar filters must resolve to boolean
+  - "sum(3) - 2"
+  - "min(childCount) && 2"
   # pipelines
-  - 'coalesce() | { true }'       # pipelines can't start with coalesce
-  - 'count() > 3 && { true }'     # scalar filters have to be in pipeline
-  - '{ true } | count()'          # naked scalar pipelines not allowed
-  - '{ true } | notAnAggregate() = 1'
-  - '{ true } | count = 1'
-  - '{ true } | max() = 1'
-  - '{ true } | by()'
+  - "coalesce() | { true }" # pipelines can't start with coalesce
+  - "count() > 3 && { true }" # scalar filters have to be in pipeline
+  - "{ true } | count()" # naked scalar pipelines not allowed
+  - "{ true } | notAnAggregate() = 1"
+  - "{ true } | count = 1"
+  - "{ true } | max() = 1"
+  - "{ true } | by()"
   # pipeline expressions
-  - '({ true }) + (count()) = 1'
-  - '({ true }) && (count())'
-  - '({ true } | count()) && ({ true } | count()) = 1'
-  - '({ true }) + ({ true }) = 1'
-  - '({ true } | count()) + ({ true } | count())'
+  - "({ true }) + (count()) = 1"
+  - "({ true }) && (count())"
+  - "({ true } | count()) && ({ true } | count()) = 1"
+  - "({ true }) + ({ true }) = 1"
+  - "({ true } | count()) + ({ true } | count())"
   # todo: improve the following
-  - '(by(namespace) | count()) > 2 * 2' # scalar expressions are currently not allowed in scalar pipelines
-  - '(by(namespace) | count()) * 2 > 2'
-  - '2 < (by(namespace) | count())'     # static value needs to be on the RHS to remove conflicts with scalar expressions
+  - "(by(namespace) | count()) > 2 * 2" # scalar expressions are currently not allowed in scalar pipelines
+  - "(by(namespace) | count()) * 2 > 2"
+  - "2 < (by(namespace) | count())" # static value needs to be on the RHS to remove conflicts with scalar expressions
 
 # validate_fails parse correctly and return an error **besides unsupported** when calling .validate()
 validate_fails:
   # span expressions must evaluate to a boolean
-  - '{ status }'
-  - '{ kind }'
-  - '{ ok }'
-  - '{ 1.1 }'
-  - '{ 1h }'
+  - "{ status }"
+  - "{ kind }"
+  - "{ ok }"
+  - "{ 1.1 }"
+  - "{ 1h }"
   - '{ "foo" }'
-  - '{ 1 + 1 }'       
+  - "{ 1 + 1 }"
   # binary operators - incorrect types
   - '{ 1 + "foo" = 1 }'
-  - '{ 1 - true = 1 }'
-  - '{ 1 / ok = 1 }'
-  - '{ 1 ^ name = 1 }'
+  - "{ 1 - true = 1 }"
+  - "{ 1 / ok = 1 }"
+  - "{ 1 ^ name = 1 }"
   - '{ 1 = "foo" }'
-  - '{ 1 != true }'
-  - '{ 1 > ok }'
-  - '{ 1 = name }'
-  - '{ 1 =~ 2}'
+  - "{ 1 != true }"
+  - "{ 1 > ok }"
+  - "{ 1 = name }"
+  - "{ 1 =~ 2}"
   - '{ 1 && "foo" }'
-  - '{ 1 || ok }'
-  - '{ true || 1.1 }'
-  - '{ status > ok }'
-  - '{ kind < consumer }'
+  - "{ 1 || ok }"
+  - "{ true || 1.1 }"
+  - "{ status > ok }"
+  - "{ kind < consumer }"
   # unary operators - incorrect types
-  - '{ -true }'
+  - "{ -true }"
   - '{ -"foo" = "bar" }'
-  - '{ -ok = status }'
+  - "{ -ok = status }"
   - '{ -name = "foo" }'
   - '{ !"foo" = "bar" }'
-  - '{ !ok = status }'
-  - '{ !consumer = kind }'
+  - "{ !ok = status }"
+  - "{ !consumer = kind }"
   - '{ !name = "foo" }'
-  - '{ !1 = 1 }'
-  - '{ !1h = 1 }'
-  - '{ !1.1 = 1.1 }'
+  - "{ !1 = 1 }"
+  - "{ !1h = 1 }"
+  - "{ !1.1 = 1.1 }"
   # scalar expressions must evaluate to a number
-  - 'min(1 = 3) = 1'
+  - "min(1 = 3) = 1"
   # scalar expressions must reference the span
-  - 'sum(3) = 2'
-  - 'sum(3) = min(14)'
-  - 'min(2h) < max(duration)'
-  - 'min(3) = max(duration)'
-  - 'min(1) = max(2) + 3'
-  - 'min(1.1 - 3) > 1'
-  - 'max(1h + 2h) > 1'
+  - "sum(3) = 2"
+  - "sum(3) = min(14)"
+  - "min(2h) < max(duration)"
+  - "min(3) = max(duration)"
+  - "min(1) = max(2) + 3"
+  - "min(1.1 - 3) > 1"
+  - "max(1h + 2h) > 1"
 
 # unsupported parse correctly and return an unsupported error when calling .validate()
 unsupported:
   # coalesce - will be valid when supported
-  - '{ true } | coalesce()'
-  - '{ true } | by(1 + .a) | coalesce()'
+  - "{ true } | coalesce()"
+  - "{ true } | by(1 + .a) | coalesce()"
   # by - will be valid when supported
-  - '{ true } | by(.a)'
-  - '{ true } | by(1 + .a)'
-  - 'by(.a) | { true }'
-  - '{ true } | by(name) | count() > 2'
-  - '{ true } | by(.field) | avg(.b) = 2'
+  - "{ true } | by(.a)"
+  - "{ true } | by(1 + .a)"
+  - "by(.a) | { true }"
+  - "{ true } | by(name) | count() > 2"
+  - "{ true } | by(.field) | avg(.b) = 2"
   # by - will *not* be valid when supported - group expressions must reference the span
-  - '{ true } | by(1)'
+  - "{ true } | by(1)"
   - '{ true } | by("foo")'
   # complex scalar filters - will be valid when supported
-  - 'min(.field) < max(duration)'
-  - 'sum(.field) = min(.field)'
-  - 'min(.field) + max(.field) > 1'
-  - 'min(.field) + max(childCount) > max(duration) - min(.field)'
-  - 'min(childCount) < 2 / 6'
-  - 'max(1 - (2 + .field)) < avg(3 * duration ^ 2)'
+  - "min(.field) < max(duration)"
+  - "sum(.field) = min(.field)"
+  - "min(.field) + max(.field) > 1"
+  - "min(.field) + max(childCount) > max(duration) - min(.field)"
+  - "min(childCount) < 2 / 6"
+  - "max(1 - (2 + .field)) < avg(3 * duration ^ 2)"
   # aggregates - will be valid when supported
-  - 'min(childCount) < 2'
-  - '{ true } | max(parent.a) = 1'
-  - '{ true } | by(3 * .field - 2) | max(duration) < 1s'
-  - '{ .http.status = 200 } | max(.field) - min(.field) > 3'
+  - "min(childCount) < 2"
+  - "{ true } | max(parent.a) = 1"
+  - "{ true } | by(3 * .field - 2) | max(duration) < 1s"
+  - "{ .http.status = 200 } | max(.field) - min(.field) > 3"
   # parent - will be valid when supported
-  - '{ parent.a != 3 }'
-  - '{ parent.resource.a && true }'
-  - '{ parent.span.a > 3 }'
-  - '{ parent.duration = 1h }'
-  - '{ parent = nil }'
-  - '{ (-(3 / 2) * .test - parent.blerg + .other)^3 = 2 }'
+  - "{ parent.a != 3 }"
+  - "{ parent.resource.a && true }"
+  - "{ parent.span.a > 3 }"
+  - "{ parent.duration = 1h }"
+  - "{ parent = nil }"
+  - "{ (-(3 / 2) * .test - parent.blerg + .other)^3 = 2 }"
   # parent - will not be valid when supported
-  - '{ parent }'
-  - '{ 1 % parent = 1 }'
-  - '{ 1 >= parent }'
-  - '{ -parent = nil }'
-  - '{ !parent = nil }'
+  - "{ parent }"
+  - "{ 1 % parent = 1 }"
+  - "{ 1 >= parent }"
+  - "{ -parent = nil }"
+  - "{ !parent = nil }"
   # nil - will be valid when supported
-  - '{ .foo = nil }'
-  # binary operations - will be valid when supported
-  - '{ "test" !~ "test" }'
-  - '{ .a !~ "test" }'
+  - "{ .foo = nil }"
   # childCount - will be valid when supported
-  - '{ 1 = childCount }'
+  - "{ 1 = childCount }"
   # childCount - will be invalid when supported
   - '{ "foo" = childCount }'
   # spanset operations - will be valid when supported
-  - '{ true } >> { true }'
-  - '{ true } > { true }'
-  - '{ true } ~ { true }'
-  - '({ true } | count() > 1 | { false }) >> ({ true } | count() > 1 | { false })'
-  - '({ true } | count() > 1 | { false }) > ({ true } | count() > 1 | { false })'
-  - '({ true } | count() > 1 | { false }) ~ ({ true } | count() > 1 | { false })'
+  - "{ true } >> { true }"
+  - "{ true } > { true }"
+  - "{ true } ~ { true }"
+  - "({ true } | count() > 1 | { false }) >> ({ true } | count() > 1 | { false })"
+  - "({ true } | count() > 1 | { false }) > ({ true } | count() > 1 | { false })"
+  - "({ true } | count() > 1 | { false }) ~ ({ true } | count() > 1 | { false })"
   # spanset pipelines + scalar filters - will be valid when supported
-  - '{ true } | count() + count() = 1' 
-  - '({ true } | count()) + ({ true } | count()) = 1'
-  - '({ true } | count()) - ({ true } | count()) <= 1'
-  - '({ true } | count()) / ({ true } | count()) > ({ true } | count()) / ({ true } | count())'
-  - '({ true } | count()) * ({ true } | count()) < ({ true } | count()) / ({ true } | count())'
-  - '({ .http.status = 200 } | count()) + ({ name = `foo` } | avg(duration)) = 2'
-  - '({ .a } | count()) > ({ .b } | count())'
+  - "{ true } | count() + count() = 1"
+  - "({ true } | count()) + ({ true } | count()) = 1"
+  - "({ true } | count()) - ({ true } | count()) <= 1"
+  - "({ true } | count()) / ({ true } | count()) > ({ true } | count()) / ({ true } | count())"
+  - "({ true } | count()) * ({ true } | count()) < ({ true } | count()) / ({ true } | count())"
+  - "({ .http.status = 200 } | count()) + ({ name = `foo` } | avg(duration)) = 2"
+  - "({ .a } | count()) > ({ .b } | count())"
   # other scalar filters. no idea if these should be supported
-  - '3 = 2'                       # naked scalar filter, technically allowed
-  - 'avg(.field) > 1 - 3'         # scalar expressions in scalar filters are currently not allowed. possible future addition
+  - "3 = 2" # naked scalar filter, technically allowed
+  - "avg(.field) > 1 - 3" # scalar expressions in scalar filters are currently not allowed. possible future addition
 
 # parsed and the ast is dumped to stdout. this is a debugging tool
-dump:
\ No newline at end of file
+dump:

From 54239cb45877261f5c1394d8d5c43128e26c075c Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Sun, 30 Apr 2023 02:11:22 +0530
Subject: [PATCH 2/9] Add predicate to match not regex operations

---
 pkg/parquetquery/predicate_test.go | 188 ++++++++++++++++++++++-------
 pkg/parquetquery/predicates.go     |  45 ++++---
 2 files changed, 175 insertions(+), 58 deletions(-)

diff --git a/pkg/parquetquery/predicate_test.go b/pkg/parquetquery/predicate_test.go
index c93e75e3765..bbafc578539 100644
--- a/pkg/parquetquery/predicate_test.go
+++ b/pkg/parquetquery/predicate_test.go
@@ -7,6 +7,7 @@ import (
 
 	"github.com/google/uuid"
 	"github.com/segmentio/parquet-go"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 
@@ -32,41 +33,153 @@ func (p *mockPredicate) KeepValue(parquet.Value) bool             { p.valCalled
 func (p *mockPredicate) KeepPage(parquet.Page) bool               { p.pageCalled = true; return p.ret }
 func (p *mockPredicate) KeepColumnChunk(parquet.ColumnChunk) bool { p.chunkCalled = true; return p.ret }
 
+type predicateTestCase struct {
+	testName   string
+	writeData  func(w *parquet.Writer) //nolint:all
+	keptChunks int
+	keptPages  int
+	keptValues int
+	predicate  Predicate
+}
+
 func TestSubstringPredicate(t *testing.T) {
+	testCases := []predicateTestCase{
+		{
+			testName:   "all chunks/pages/values inspected",
+			predicate:  NewSubstringPredicate("b"),
+			keptChunks: 1,
+			keptPages:  1,
+			keptValues: 2,
+			writeData: func(w *parquet.Writer) { //nolint:all
+				type String struct {
+					S string `parquet:",dict"`
+				}
+				require.NoError(t, w.Write(&String{"abc"})) // kept
+				require.NoError(t, w.Write(&String{"bcd"})) // kept
+				require.NoError(t, w.Write(&String{"cde"})) // skipped
+			},
+		},
+		{
+			testName:   "dictionary in the page header allows for skipping a page",
+			predicate:  NewSubstringPredicate("x"), // Not present in any values
+			keptChunks: 1,
+			keptPages:  0,
+			keptValues: 0,
+			writeData: func(w *parquet.Writer) { //nolint:all
+				type dictString struct {
+					S string `parquet:",dict"`
+				}
+				require.NoError(t, w.Write(&dictString{"abc"}))
+				require.NoError(t, w.Write(&dictString{"abc"}))
+				require.NoError(t, w.Write(&dictString{"abc"}))
+				require.NoError(t, w.Write(&dictString{"abc"}))
+				require.NoError(t, w.Write(&dictString{"abc"}))
+			},
+		},
+	}
+
+	for _, tC := range testCases {
+		t.Run(tC.testName, func(t *testing.T) {
+			testPredicate(t, tC)
+		})
+	}
+}
+
+func TestNewRegexInPredicate(t *testing.T) {
+	testCases := []predicateTestCase{
+		{
+			testName: "all chunks/pages/values inspected",
+			predicate: func() Predicate {
+				pred, err := NewRegexInPredicate([]string{"a.*"})
+				assert.NoError(t, err)
+				return pred
+			}(),
+			keptChunks: 1,
+			keptPages:  1,
+			keptValues: 2,
+			writeData: func(w *parquet.Writer) { //nolint:all
+				type String struct {
+					S string `parquet:",dict"`
+				}
+				require.NoError(t, w.Write(&String{"abc"})) // kept
+				require.NoError(t, w.Write(&String{"acd"})) // kept
+				require.NoError(t, w.Write(&String{"cde"})) // skipped
+			},
+		},
+		{
+			testName: "dictionary in the page header allows for skipping a page",
+			predicate: func() Predicate {
+				pred, err := NewRegexInPredicate([]string{"x.*"})
+				assert.NoError(t, err)
+				return pred
+			}(), // Not present in any values
+			keptChunks: 1,
+			keptPages:  0,
+			keptValues: 0,
+			writeData: func(w *parquet.Writer) { //nolint:all
+				type dictString struct {
+					S string `parquet:",dict"`
+				}
+				require.NoError(t, w.Write(&dictString{"abc"}))
+				require.NoError(t, w.Write(&dictString{"abc"}))
+			},
+		},
+	}
 
-	// Normal case - all chunks/pages/values inspected
-	testPredicate(t, predicateTestCase{
-		predicate:  NewSubstringPredicate("b"),
-		keptChunks: 1,
-		keptPages:  1,
-		keptValues: 2,
-		writeData: func(w *parquet.Writer) { //nolint:all
-			type String struct {
-				S string `parquet:",dict"`
-			}
-			require.NoError(t, w.Write(&String{"abc"})) // kept
-			require.NoError(t, w.Write(&String{"bcd"})) // kept
-			require.NoError(t, w.Write(&String{"cde"})) // skipped
+	for _, tC := range testCases {
+		t.Run(tC.testName, func(t *testing.T) {
+			testPredicate(t, tC)
+		})
+	}
+}
+
+func TestNewRegexNotInPredicate(t *testing.T) {
+	testCases := []predicateTestCase{
+		{
+			testName: "all chunks/pages/values inspected",
+			predicate: func() Predicate {
+				pred, err := NewRegexNotInPredicate([]string{"a.*"})
+				assert.NoError(t, err)
+				return pred
+			}(),
+			keptChunks: 1,
+			keptPages:  1,
+			keptValues: 2,
+			writeData: func(w *parquet.Writer) { //nolint:all
+				type String struct {
+					S string `parquet:",dict"`
+				}
+				require.NoError(t, w.Write(&String{"abc"})) // skipped
+				require.NoError(t, w.Write(&String{"acd"})) // skipped
+				require.NoError(t, w.Write(&String{"cde"})) // kept
+				require.NoError(t, w.Write(&String{"xde"})) // kept
+			},
 		},
-	})
-
-	// Dictionary in the page header allows for skipping a page
-	testPredicate(t, predicateTestCase{
-		predicate:  NewSubstringPredicate("x"), // Not present in any values
-		keptChunks: 1,
-		keptPages:  0,
-		keptValues: 0,
-		writeData: func(w *parquet.Writer) { //nolint:all
-			type dictString struct {
-				S string `parquet:",dict"`
-			}
-			require.NoError(t, w.Write(&dictString{"abc"}))
-			require.NoError(t, w.Write(&dictString{"abc"}))
-			require.NoError(t, w.Write(&dictString{"abc"}))
-			require.NoError(t, w.Write(&dictString{"abc"}))
-			require.NoError(t, w.Write(&dictString{"abc"}))
+		{
+			testName: "dictionary in the page header allows for skipping a page",
+			predicate: func() Predicate {
+				pred, err := NewRegexNotInPredicate([]string{"x.*"})
+				assert.NoError(t, err)
+				return pred
+			}(), // Matches every value, so none are kept
+			keptChunks: 1,
+			keptPages:  0,
+			keptValues: 0,
+			writeData: func(w *parquet.Writer) { //nolint:all
+				type dictString struct {
+					S string `parquet:",dict"`
+				}
+				require.NoError(t, w.Write(&dictString{"xyz"}))
+				require.NoError(t, w.Write(&dictString{"xyz"}))
+			},
 		},
-	})
+	}
+
+	for _, tC := range testCases {
+		t.Run(tC.testName, func(t *testing.T) {
+			testPredicate(t, tC)
+		})
+	}
 }
 
 // TestOrPredicateCallsKeepColumnChunk ensures that the OrPredicate calls
@@ -120,17 +233,10 @@ func TestOrPredicateCallsKeepColumnChunk(t *testing.T) {
 	}
 }
 
-type predicateTestCase struct {
-	writeData  func(w *parquet.Writer) //nolint:all
-	keptChunks int
-	keptPages  int
-	keptValues int
-	predicate  Predicate
-}
-
-// testPredicate by writing data and then iterating the column.  The data model
-// must contain a single column.
+// testPredicate exercises a predicate by writing data and then iterating the column.
+// The data model must contain a single column.
 func testPredicate(t *testing.T, tc predicateTestCase) {
+	t.Helper()
 	buf := new(bytes.Buffer)
 	w := parquet.NewWriter(buf)
 	tc.writeData(w)
diff --git a/pkg/parquetquery/predicates.go b/pkg/parquetquery/predicates.go
index 0acd8d403c3..f5b78edd556 100644
--- a/pkg/parquetquery/predicates.go
+++ b/pkg/parquetquery/predicates.go
@@ -82,21 +82,33 @@ func (p *StringInPredicate) KeepPage(page pq.Page) bool {
 	return p.helper.keepPage(page, p.KeepValue)
 }
 
-// RegexInPredicate checks for match against any of the given regexs.
-// Memoized and resets on each row group.
-type RegexInPredicate struct {
-	regs    []*regexp.Regexp
-	matches map[string]bool
+type regexPredicate struct {
+	regs        []*regexp.Regexp
+	matches     map[string]bool
+	shouldMatch bool
 
 	helper DictionaryPredicateHelper
 }
 
-var _ Predicate = (*RegexInPredicate)(nil)
+var _ Predicate = (*regexPredicate)(nil)
+
+// NewRegexInPredicate checks for match against any of the given regexs.
+// Memoized and resets on each row group.
+func NewRegexInPredicate(regs []string) (Predicate, error) {
+	return newRegexPredicate(regs, true)
+}
+
+// NewRegexNotInPredicate checks for values that do not match against any of the given regexs.
+// Memoized and resets on each row group.
+func NewRegexNotInPredicate(regs []string) (Predicate, error) {
+	return newRegexPredicate(regs, false)
+}
 
-func NewRegexInPredicate(regs []string) (*RegexInPredicate, error) {
-	p := &RegexInPredicate{
-		regs:    make([]*regexp.Regexp, 0, len(regs)),
-		matches: make(map[string]bool),
+func newRegexPredicate(regs []string, shouldMatch bool) (Predicate, error) {
+	p := &regexPredicate{
+		regs:        make([]*regexp.Regexp, 0, len(regs)),
+		matches:     make(map[string]bool),
+		shouldMatch: shouldMatch,
 	}
 	for _, reg := range regs {
 		r, err := regexp.Compile(reg)
@@ -108,7 +120,7 @@ func NewRegexInPredicate(regs []string) (*RegexInPredicate, error) {
 	return p, nil
 }
 
-func (p *RegexInPredicate) String() string {
+func (p *regexPredicate) String() string {
 	var strings string
 	for _, s := range p.regs {
 		strings += fmt.Sprintf("%s, ", s.String())
@@ -116,9 +128,8 @@ func (p *RegexInPredicate) String() string {
 	return fmt.Sprintf("RegexInPredicate{%s}", strings)
 }
 
-func (p *RegexInPredicate) keep(v *pq.Value) bool {
+func (p *regexPredicate) keep(v *pq.Value) bool {
 	if v.IsNull() {
-		// Null
 		return false
 	}
 
@@ -129,7 +140,7 @@ func (p *RegexInPredicate) keep(v *pq.Value) bool {
 
 	matched := false
 	for _, r := range p.regs {
-		if r.MatchString(s) {
+		if r.MatchString(s) == p.shouldMatch {
 			matched = true
 			break
 		}
@@ -139,7 +150,7 @@ func (p *RegexInPredicate) keep(v *pq.Value) bool {
 	return matched
 }
 
-func (p *RegexInPredicate) KeepColumnChunk(cc pq.ColumnChunk) bool {
+func (p *regexPredicate) KeepColumnChunk(cc pq.ColumnChunk) bool {
 	p.helper.setNewRowGroup()
 
 	// Reset match cache on each row group change
@@ -149,11 +160,11 @@ func (p *RegexInPredicate) KeepColumnChunk(cc pq.ColumnChunk) bool {
 	return true
 }
 
-func (p *RegexInPredicate) KeepValue(v pq.Value) bool {
+func (p *regexPredicate) KeepValue(v pq.Value) bool {
 	return p.keep(&v)
 }
 
-func (p *RegexInPredicate) KeepPage(page pq.Page) bool {
+func (p *regexPredicate) KeepPage(page pq.Page) bool {
 	return p.helper.keepPage(page, p.KeepValue)
 }
 

From 57c3b27e24471f952d5e96c55090c6d64fc426e5 Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Sun, 30 Apr 2023 02:11:57 +0530
Subject: [PATCH 3/9] Support for searching pattern with not regex op

---
 tempodb/encoding/vparquet/block_traceql.go       | 4 +++-
 tempodb/encoding/vparquet/block_traceql_test.go  | 2 ++
 tempodb/encoding/vparquet2/block_traceql.go      | 4 +++-
 tempodb/encoding/vparquet2/block_traceql_test.go | 2 ++
 4 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tempodb/encoding/vparquet/block_traceql.go b/tempodb/encoding/vparquet/block_traceql.go
index 255ab26393b..ce620dbc090 100644
--- a/tempodb/encoding/vparquet/block_traceql.go
+++ b/tempodb/encoding/vparquet/block_traceql.go
@@ -195,7 +195,7 @@ func checkConditions(conditions []traceql.Condition) error {
 		case traceql.OpEqual, traceql.OpNotEqual,
 			traceql.OpGreater, traceql.OpGreaterEqual,
 			traceql.OpLess, traceql.OpLessEqual,
-			traceql.OpRegex:
+			traceql.OpRegex, traceql.OpNotRegex:
 			if opCount != 1 {
 				return fmt.Errorf("operation %v must have exactly 1 argument. condition: %+v", cond.Op, cond)
 			}
@@ -837,6 +837,8 @@ func createStringPredicate(op traceql.Operator, operands traceql.Operands) (parq
 
 	case traceql.OpRegex:
 		return parquetquery.NewRegexInPredicate([]string{s})
+	case traceql.OpNotRegex:
+		return parquetquery.NewRegexNotInPredicate([]string{s})
 	case traceql.OpEqual:
 		return parquetquery.NewStringInPredicate([]string{s}), nil
 	case traceql.OpGreater:
diff --git a/tempodb/encoding/vparquet/block_traceql_test.go b/tempodb/encoding/vparquet/block_traceql_test.go
index 399d7e35900..ff6cf89b9f1 100644
--- a/tempodb/encoding/vparquet/block_traceql_test.go
+++ b/tempodb/encoding/vparquet/block_traceql_test.go
@@ -138,6 +138,7 @@ func TestBackendBlockSearchTraceQL(t *testing.T) {
 		traceql.MustExtractFetchSpansRequest(`{.foo = "def"}`),         // String ==
 		traceql.MustExtractFetchSpansRequest(`{.foo != "deg"}`),        // String !=
 		traceql.MustExtractFetchSpansRequest(`{.foo =~ "d.*"}`),        // String Regex
+		traceql.MustExtractFetchSpansRequest(`{.foo !~ "x.*"}`),        // String Not Regex
 		traceql.MustExtractFetchSpansRequest(`{resource.foo = "abc"}`), // Resource-level only
 		traceql.MustExtractFetchSpansRequest(`{span.foo = "def"}`),     // Span-level only
 		traceql.MustExtractFetchSpansRequest(`{.foo}`),                 // Projection only
@@ -222,6 +223,7 @@ func TestBackendBlockSearchTraceQL(t *testing.T) {
 		// TODO - Should the below query return data or not?  It does match the resource
 		// makeReq(parse(t, `{.foo = "abc"}`)),                           // This should not return results because the span has overridden this attribute to "def".
 		traceql.MustExtractFetchSpansRequest(`{.foo =~ "xyz.*"}`),                                     // Regex IN
+		traceql.MustExtractFetchSpansRequest(`{.foo !~ ".*"}`),                                        // Regex NOT IN
 		traceql.MustExtractFetchSpansRequest(`{span.bool = true}`),                                    // Bool not match
 		traceql.MustExtractFetchSpansRequest(`{` + LabelDuration + ` >  100s}`),                       // Intrinsic: duration
 		traceql.MustExtractFetchSpansRequest(`{` + LabelStatus + ` = ok}`),                            // Intrinsic: status
diff --git a/tempodb/encoding/vparquet2/block_traceql.go b/tempodb/encoding/vparquet2/block_traceql.go
index 5b4d63fab1a..0172ef7750f 100644
--- a/tempodb/encoding/vparquet2/block_traceql.go
+++ b/tempodb/encoding/vparquet2/block_traceql.go
@@ -196,7 +196,7 @@ func checkConditions(conditions []traceql.Condition) error {
 		case traceql.OpEqual, traceql.OpNotEqual,
 			traceql.OpGreater, traceql.OpGreaterEqual,
 			traceql.OpLess, traceql.OpLessEqual,
-			traceql.OpRegex:
+			traceql.OpRegex, traceql.OpNotRegex:
 			if opCount != 1 {
 				return fmt.Errorf("operation %v must have exactly 1 argument. condition: %+v", cond.Op, cond)
 			}
@@ -825,6 +825,8 @@ func createStringPredicate(op traceql.Operator, operands traceql.Operands) (parq
 
 	case traceql.OpRegex:
 		return parquetquery.NewRegexInPredicate([]string{s})
+	case traceql.OpNotRegex:
+		return parquetquery.NewRegexNotInPredicate([]string{s})
 
 	case traceql.OpEqual:
 		return parquetquery.NewStringInPredicate([]string{s}), nil
diff --git a/tempodb/encoding/vparquet2/block_traceql_test.go b/tempodb/encoding/vparquet2/block_traceql_test.go
index b4e9707928d..0fbb58b5d10 100644
--- a/tempodb/encoding/vparquet2/block_traceql_test.go
+++ b/tempodb/encoding/vparquet2/block_traceql_test.go
@@ -139,6 +139,7 @@ func TestBackendBlockSearchTraceQL(t *testing.T) {
 		traceql.MustExtractFetchSpansRequest(`{.foo = "def"}`),         // String ==
 		traceql.MustExtractFetchSpansRequest(`{.foo != "deg"}`),        // String !=
 		traceql.MustExtractFetchSpansRequest(`{.foo =~ "d.*"}`),        // String Regex
+		traceql.MustExtractFetchSpansRequest(`{.foo !~ "x.*"}`),        // String Not Regex
 		traceql.MustExtractFetchSpansRequest(`{resource.foo = "abc"}`), // Resource-level only
 		traceql.MustExtractFetchSpansRequest(`{span.foo = "def"}`),     // Span-level only
 		traceql.MustExtractFetchSpansRequest(`{.foo}`),                 // Projection only
@@ -223,6 +224,7 @@ func TestBackendBlockSearchTraceQL(t *testing.T) {
 		// TODO - Should the below query return data or not?  It does match the resource
 		// makeReq(parse(t, `{.foo = "abc"}`)),                           // This should not return results because the span has overridden this attribute to "def".
 		traceql.MustExtractFetchSpansRequest(`{.foo =~ "xyz.*"}`),                                     // Regex IN
+		traceql.MustExtractFetchSpansRequest(`{.foo !~ ".*"}`),                                        // String Not Regex
 		traceql.MustExtractFetchSpansRequest(`{span.bool = true}`),                                    // Bool not match
 		traceql.MustExtractFetchSpansRequest(`{` + LabelDuration + ` >  100s}`),                       // Intrinsic: duration
 		traceql.MustExtractFetchSpansRequest(`{` + LabelStatus + ` = ok}`),                            // Intrinsic: status

From 8261728b2bbc53f635ebc9913537ac2d621c30ee Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Sun, 30 Apr 2023 02:46:08 +0530
Subject: [PATCH 4/9] Add doc for negated regex

---
 docs/sources/tempo/traceql/_index.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/sources/tempo/traceql/_index.md b/docs/sources/tempo/traceql/_index.md
index eb5592e54ba..782c2fb14f0 100644
--- a/docs/sources/tempo/traceql/_index.md
+++ b/docs/sources/tempo/traceql/_index.md
@@ -120,6 +120,7 @@ The implemented comparison operators are:
 - `<` (less than)
 - `<=` (less than or equal to)
 - `=~` (regular expression)
+- `!~` (negated regular expression)
 
 TraceQL uses Golang regular expressions. Online regular expression testing sites like https://regex101.com/ are convenient to validate regular expressions used in TraceQL queries.
 

From b012030b133d0f88e968bf3548ba98903b2c857e Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Sun, 30 Apr 2023 02:51:33 +0530
Subject: [PATCH 5/9] Update changelog

---
 CHANGELOG.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6725572300..a3704987c09 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 ## main / unreleased
 
-* [ENHANCEMENT] Add `prefix` configuration option to `storage.trace.azure` and `storage.trace.gcs` [#2362](https://github.com/grafana/tempo/pull/2386) (@kousikmitra)
+* [ENHANCEMENT] Add support to filter using negated regex operator `!~` [#2410](https://github.com/grafana/tempo/pull/2410) (@kousikmitra)
+* [ENHANCEMENT] Add `prefix` configuration option to `storage.trace.azure` and `storage.trace.gcs` [#2386](https://github.com/grafana/tempo/pull/2386) (@kousikmitra)
 * [ENHANCEMENT] Add `prefix` configuration option to `storage.trace.s3` [#2362](https://github.com/grafana/tempo/pull/2362) (@kousikmitra)
 * [FEATURE] Add support for `q` query param in `/api/v2/search/<tag.name>/values` to filter results based on a TraceQL query [#2253](https://github.com/grafana/tempo/pull/2253) (@mapno)
 * [ENHANCEMENT] Add `scope` parameter to `/api/search/tags` [#2282](https://github.com/grafana/tempo/pull/2282) (@joe-elliott)

From 47587a1bfa4c2fdb89bf654aedcb76fd159a1555 Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Mon, 1 May 2023 17:51:43 +0530
Subject: [PATCH 6/9] Replace asserts with require

---
 pkg/parquetquery/predicate_test.go | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/pkg/parquetquery/predicate_test.go b/pkg/parquetquery/predicate_test.go
index bbafc578539..b8f907b1f32 100644
--- a/pkg/parquetquery/predicate_test.go
+++ b/pkg/parquetquery/predicate_test.go
@@ -7,7 +7,6 @@ import (
 
 	"github.com/google/uuid"
 	"github.com/segmentio/parquet-go"
-	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
 
@@ -91,7 +90,8 @@ func TestNewRegexInPredicate(t *testing.T) {
 			testName: "all chunks/pages/values inspected",
 			predicate: func() Predicate {
 				pred, err := NewRegexInPredicate([]string{"a.*"})
-				assert.NoError(t, err)
+				require.NoError(t, err)
+
 				return pred
 			}(),
 			keptChunks: 1,
@@ -110,7 +110,8 @@ func TestNewRegexInPredicate(t *testing.T) {
 			testName: "dictionary in the page header allows for skipping a page",
 			predicate: func() Predicate {
 				pred, err := NewRegexInPredicate([]string{"x.*"})
-				assert.NoError(t, err)
+				require.NoError(t, err)
+
 				return pred
 			}(), // Not present in any values
 			keptChunks: 1,
@@ -139,7 +140,8 @@ func TestNewRegexNotInPredicate(t *testing.T) {
 			testName: "all chunks/pages/values inspected",
 			predicate: func() Predicate {
 				pred, err := NewRegexNotInPredicate([]string{"a.*"})
-				assert.NoError(t, err)
+				require.NoError(t, err)
+
 				return pred
 			}(),
 			keptChunks: 1,
@@ -159,7 +161,8 @@ func TestNewRegexNotInPredicate(t *testing.T) {
 			testName: "dictionary in the page header allows for skipping a page",
 			predicate: func() Predicate {
 				pred, err := NewRegexNotInPredicate([]string{"x.*"})
-				assert.NoError(t, err)
+				require.NoError(t, err)
+
 				return pred
 			}(), // Not present in any values
 			keptChunks: 1,

From f81560f67df6fdab6a943b4f9dcf1d4552f75e91 Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Mon, 1 May 2023 18:04:08 +0530
Subject: [PATCH 7/9] Move dictString def to pkg level as testDictString

---
 pkg/parquetquery/predicate_test.go | 65 ++++++++++++------------------
 1 file changed, 26 insertions(+), 39 deletions(-)

diff --git a/pkg/parquetquery/predicate_test.go b/pkg/parquetquery/predicate_test.go
index b8f907b1f32..b70c1e2aa6e 100644
--- a/pkg/parquetquery/predicate_test.go
+++ b/pkg/parquetquery/predicate_test.go
@@ -10,8 +10,6 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-var _ Predicate = (*mockPredicate)(nil)
-
 type mockPredicate struct {
 	ret         bool
 	valCalled   bool
@@ -19,6 +17,12 @@ type mockPredicate struct {
 	chunkCalled bool
 }
 
+type testDictString struct { // one-string-column row type shared by the predicate tests; the dict tag presumably requests dictionary encoding — confirm against parquet-go
+	S string `parquet:",dict"`
+}
+
+var _ Predicate = (*mockPredicate)(nil)
+
 func newAlwaysTruePredicate() *mockPredicate {
 	return &mockPredicate{ret: true}
 }
@@ -50,12 +54,10 @@ func TestSubstringPredicate(t *testing.T) {
 			keptPages:  1,
 			keptValues: 2,
 			writeData: func(w *parquet.Writer) { //nolint:all
-				type String struct {
-					S string `parquet:",dict"`
-				}
-				require.NoError(t, w.Write(&String{"abc"})) // kept
-				require.NoError(t, w.Write(&String{"bcd"})) // kept
-				require.NoError(t, w.Write(&String{"cde"})) // skipped
+
+				require.NoError(t, w.Write(&testDictString{"abc"})) // kept
+				require.NoError(t, w.Write(&testDictString{"bcd"})) // kept
+				require.NoError(t, w.Write(&testDictString{"cde"})) // skipped
 			},
 		},
 		{
@@ -65,14 +67,11 @@ func TestSubstringPredicate(t *testing.T) {
 			keptPages:  0,
 			keptValues: 0,
 			writeData: func(w *parquet.Writer) { //nolint:all
-				type dictString struct {
-					S string `parquet:",dict"`
-				}
-				require.NoError(t, w.Write(&dictString{"abc"}))
-				require.NoError(t, w.Write(&dictString{"abc"}))
-				require.NoError(t, w.Write(&dictString{"abc"}))
-				require.NoError(t, w.Write(&dictString{"abc"}))
-				require.NoError(t, w.Write(&dictString{"abc"}))
+				require.NoError(t, w.Write(&testDictString{"abc"}))
+				require.NoError(t, w.Write(&testDictString{"abc"}))
+				require.NoError(t, w.Write(&testDictString{"abc"}))
+				require.NoError(t, w.Write(&testDictString{"abc"}))
+				require.NoError(t, w.Write(&testDictString{"abc"}))
 			},
 		},
 	}
@@ -98,12 +97,9 @@ func TestNewRegexInPredicate(t *testing.T) {
 			keptPages:  1,
 			keptValues: 2,
 			writeData: func(w *parquet.Writer) { //nolint:all
-				type String struct {
-					S string `parquet:",dict"`
-				}
-				require.NoError(t, w.Write(&String{"abc"})) // kept
-				require.NoError(t, w.Write(&String{"acd"})) // kept
-				require.NoError(t, w.Write(&String{"cde"})) // skipped
+				require.NoError(t, w.Write(&testDictString{"abc"})) // kept
+				require.NoError(t, w.Write(&testDictString{"acd"})) // kept
+				require.NoError(t, w.Write(&testDictString{"cde"})) // skipped
 			},
 		},
 		{
@@ -118,11 +114,8 @@ func TestNewRegexInPredicate(t *testing.T) {
 			keptPages:  0,
 			keptValues: 0,
 			writeData: func(w *parquet.Writer) { //nolint:all
-				type dictString struct {
-					S string `parquet:",dict"`
-				}
-				require.NoError(t, w.Write(&dictString{"abc"}))
-				require.NoError(t, w.Write(&dictString{"abc"}))
+				require.NoError(t, w.Write(&testDictString{"abc"}))
+				require.NoError(t, w.Write(&testDictString{"abc"}))
 			},
 		},
 	}
@@ -148,13 +141,10 @@ func TestNewRegexNotInPredicate(t *testing.T) {
 			keptPages:  1,
 			keptValues: 2,
 			writeData: func(w *parquet.Writer) { //nolint:all
-				type String struct {
-					S string `parquet:",dict"`
-				}
-				require.NoError(t, w.Write(&String{"abc"})) // skipped
-				require.NoError(t, w.Write(&String{"acd"})) // skipped
-				require.NoError(t, w.Write(&String{"cde"})) // kept
-				require.NoError(t, w.Write(&String{"xde"})) // kept
+				require.NoError(t, w.Write(&testDictString{"abc"})) // skipped
+				require.NoError(t, w.Write(&testDictString{"acd"})) // skipped
+				require.NoError(t, w.Write(&testDictString{"cde"})) // kept
+				require.NoError(t, w.Write(&testDictString{"xde"})) // kept
 			},
 		},
 		{
@@ -169,11 +159,8 @@ func TestNewRegexNotInPredicate(t *testing.T) {
 			keptPages:  0,
 			keptValues: 0,
 			writeData: func(w *parquet.Writer) { //nolint:all
-				type dictString struct {
-					S string `parquet:",dict"`
-				}
-				require.NoError(t, w.Write(&dictString{"xyz"}))
-				require.NoError(t, w.Write(&dictString{"xyz"}))
+				require.NoError(t, w.Write(&testDictString{"xyz"}))
+				require.NoError(t, w.Write(&testDictString{"xyz"}))
 			},
 		},
 	}

From f2ea0d55536aed1bddbde0c8296dac5805984cff Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Mon, 1 May 2023 18:38:44 +0530
Subject: [PATCH 8/9] Add benchmark for regex predicate

---
 pkg/parquetquery/predicate_test.go | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pkg/parquetquery/predicate_test.go b/pkg/parquetquery/predicate_test.go
index b70c1e2aa6e..cf36d9da349 100644
--- a/pkg/parquetquery/predicate_test.go
+++ b/pkg/parquetquery/predicate_test.go
@@ -286,3 +286,21 @@ func BenchmarkStringInPredicate(b *testing.B) {
 		}
 	}
 }
+
+func BenchmarkRegexInPredicate(b *testing.B) { // times KeepValue on a regex predicate built from the single pattern "abc"
+	p, err := NewRegexInPredicate([]string{"abc"})
+	require.NoError(b, err)
+
+	s := make([]parquet.Value, 1000) // fixture of 1000 values, built once and reused for every b.N iteration
+	for i := 0; i < 1000; i++ {
+		s[i] = parquet.ValueOf(uuid.New().String())
+	}
+
+	b.ResetTimer() // exclude predicate construction and fixture generation from the measurement
+
+	for i := 0; i < b.N; i++ {
+		for _, ss := range s {
+			p.KeepValue(ss) // return value is discarded; only the call itself is being timed
+		}
+	}
+}

From 3b1c8232c474a8c0cdeb55c2b9f16b1d82728906 Mon Sep 17 00:00:00 2001
From: Kousik Mitra <kousikmitra12@gmail.com>
Date: Mon, 1 May 2023 19:26:17 +0530
Subject: [PATCH 9/9] Revert auto format changes in yaml file

---
 pkg/traceql/test_examples.yaml | 381 +++++++++++++++++----------------
 1 file changed, 191 insertions(+), 190 deletions(-)

diff --git a/pkg/traceql/test_examples.yaml b/pkg/traceql/test_examples.yaml
index 03022f728d6..cf428e70435 100644
--- a/pkg/traceql/test_examples.yaml
+++ b/pkg/traceql/test_examples.yaml
@@ -1,257 +1,258 @@
 # valid queries parse successfully and return nil when calling .validate()
 valid:
   # spanset filters
-  - "{ true }"
-  - "{ !true }"
-  - "{ true && false }"
-  - "{ true || false }"
-  - "{ 1 = 2 }"
-  - "{ 1 != 2 }"
-  - "{ 1 > 2 }"
-  - "{ 1 >= 2 }"
-  - "{ 1 < 2 }"
-  - "{ 1 <= 2 }"
-  - "{ -1 = 2 }"
+  - '{ true }'
+  - '{ !true }'
+  - '{ true && false }'
+  - '{ true || false }'
+  - '{ 1 = 2 }'
+  - '{ 1 != 2 }'
+  - '{ 1 > 2 }'
+  - '{ 1 >= 2 }'
+  - '{ 1 < 2 }'
+  - '{ 1 <= 2 }'
+  - '{ -1 = 2 }'
   - '{ "test" =~ "test" }'
   - '{ "test" !~ "test" }'
   - '{ "test" = "test" }'
   - '{ "test" != "test" }'
-  - "{ .a }"
-  - "{ !.a }"
-  - "{ .a && false }"
-  - "{ .a || true }"
-  - "{ .a = 2 }"
-  - "{ .a != 2 }"
-  - "{ .a > 2 }"
-  - "{ .a >= 2 }"
-  - "{ .a < 2 }"
-  - "{ .a <= 2 }"
-  - "{ -.a = 2 }"
+  - '{ .a }'
+  - '{ !.a }'
+  - '{ .a && false }'
+  - '{ .a || true }'
+  - '{ .a = 2 }'
+  - '{ .a != 2 }'
+  - '{ .a > 2 }'
+  - '{ .a >= 2 }'
+  - '{ .a < 2 }'
+  - '{ .a <= 2 }'
+  - '{ -.a = 2 }'
   - '{ .a =~ "test" }'
   - '{ .a !~ "test" }'
   - '{ .a = "test" }'
   - '{ .a != "test" }'
-  - "{ resource.a != 3 }"
-  - "{ span.a != 3 }"
+  - '{ resource.a != 3 }'
+  - '{ span.a != 3 }'
   - '{ !("test" != .c || ((true && .b) || 3 < .a)) }'
-  - "{ status = ok }"
-  - "{ status = unset }"
-  - "{ status = error }"
-  - "{ status != error }"
-  - "{ kind = internal }"
-  - "{ kind = client }"
-  - "{ kind = consumer }"
-  - "{ duration > 1s }"
-  - "{ 1 < 1h }"
-  - "{ 1 <= 1.1 }"
+  - '{ status = ok }'
+  - '{ status = unset }'
+  - '{ status = error }'
+  - '{ status != error }'
+  - '{ kind = internal }'
+  - '{ kind = client }'
+  - '{ kind = consumer }'
+  - '{ duration > 1s }'
+  - '{ 1 < 1h }'
+  - '{ 1 <= 1.1 }'
   # binary operations
-  - "{ 1 + 1 = 2 }"
-  - "{ 1 - 1 = 2 }"
-  - "{ 1 * 1 = 2 }"
-  - "{ 1 / 1 = 2 }"
-  - "{ 1 ^ 1 = 2 }"
-  - "{ .a + 1 = 2 }"
-  - "{ .a - 1 = 2 }"
-  - "{ .a * 1 = 2 }"
-  - "{ .a / 1 = 2 }"
-  - "{ .a ^ 1 = 2 }"
-  - "{ duration > 1s * 2s }"
-  - "{ 1 * 1h = 1 }" # combining float, int and duration can make sense, but can also be weird. we just accept it all
-  - "{ 1 / 1.1 = 1 }"
+  - '{ 1 + 1 = 2 }'
+  - '{ 1 - 1 = 2 }'
+  - '{ 1 * 1 = 2 }'
+  - '{ 1 / 1 = 2 }'
+  - '{ 1 ^ 1 = 2 }'
+  - '{ .a + 1 = 2 }'
+  - '{ .a - 1 = 2 }'
+  - '{ .a * 1 = 2 }'
+  - '{ .a / 1 = 2 }'
+  - '{ .a ^ 1 = 2 }'
+  - '{ duration > 1s * 2s }' 
+  - '{ 1 * 1h = 1 }'     # combining float, int and duration can make sense, but can also be weird. we just accept it all
+  - '{ 1 / 1.1 = 1 }'
   - '{ .http.status >= "200" }'
   # spanset expressions
-  - "{ true } && { true }"
-  - "{ true } || { true }"
+  - '{ true } && { true }'
+  - '{ true } || { true }'
   # scalar filters
-  - "avg(.field) > 1"
-  - "max(duration) >= 1s"
-  - "max(duration) > 1" # same note as above for int, float and duration
-  - "{ true } | max(duration) = 1h"
-  - "{ true } | min(duration) = 1h"
-  - "{ true } | sum(duration) = 1h"
-  - "{ true } | max(.a) = 1"
-  - "{ true } | max(span.a) = 1"
-  - "{ true } | max(resource.a) = 1"
-  - "{ true } | max(1 + .a) = 1"
-  - "{ true } | max((1 + .a) * 2) = 1"
-  - "max(duration) > 3s | { status = error || .http.status = 500 }"
+  - 'avg(.field) > 1'
+  - 'max(duration) >= 1s'
+  - 'max(duration) > 1'            # same note as above for int, float and duration
+  - '{ true } | max(duration) = 1h'
+  - '{ true } | min(duration) = 1h'
+  - '{ true } | sum(duration) = 1h'
+  - '{ true } | max(.a) = 1'
+  - '{ true } | max(span.a) = 1'
+  - '{ true } | max(resource.a) = 1'
+  - '{ true } | max(1 + .a) = 1'
+  - '{ true } | max((1 + .a) * 2) = 1'
+  - 'max(duration) > 3s | { status = error || .http.status = 500 }'
   # pipelines
-  - "{ true } | { .a }"
-  - "{ true } | count() = 1"
-  - "{ true } | avg(duration) = 1h"
-  - "count() = 1 | { true }"
-  - "{ true } | count() = 1 | { true }"
+  - '{ true } | { .a }'
+  - '{ true } | count() = 1'
+  - '{ true } | avg(duration) = 1h'
+  - 'count() = 1 | { true }'
+  - '{ true } | count() = 1 | { true }'
   # pipeline expressions
-  - "({ true } | count() > 1 | { false }) && ({ true } | count() > 1 | { false })"
-  - "({ true } | count() > 1 | { false }) || ({ true } | count() > 1 | { false })"
-
+  - '({ true } | count() > 1 | { false }) && ({ true } | count() > 1 | { false })'
+  - '({ true } | count() > 1 | { false }) || ({ true } | count() > 1 | { false })'
+  
 # parse_fails throw an error when parsing
 parse_fails:
-  - "true"
-  - "[ true ]"
-  - "( true )"
+  - 'true'
+  - '[ true ]'
+  - '( true )'
   # spanset filters
-  - "{ . }"
-  - "{ < }"
-  - "{ .a < }"
-  - "{ .a < 3"
-  - "{ (.a < 3 }"
-  - "{ attribute = 4 }" # custom attribute not prefixed with ., span., resource. or parent.
-  - "{ .attribute == 4 }" # invalid operator
-  - "{ span. }"
+  - '{ . }'
+  - '{ < }'
+  - '{ .a < }'
+  - '{ .a < 3'
+  - '{ (.a < 3 }'
+  - '{ attribute = 4 }'           # custom attribute not prefixed with ., span., resource. or parent.
+  - '{ .attribute == 4 }'         # invalid operator
+  - '{ span. }'
   # spanset expressions
-  - "{ true } + { true }"
-  - "{ true } - { true }"
-  - "{ true } * { true }"
-  - "{ true } / { true }"
-  - "{ true } ^ { true }"
-  - "{ true } = { true }" # an interesting operator. possible future addition
-  - "{ true } <= { true }"
-  - "{ true } >= { true }"
-  - "{ true } < { true }"
+  - '{ true } + { true }'
+  - '{ true } - { true }'
+  - '{ true } * { true }'
+  - '{ true } / { true }'
+  - '{ true } ^ { true }'
+  - '{ true } = { true }'         # an interesting operator. possible future addition
+  - '{ true } <= { true }'
+  - '{ true } >= { true }'
+  - '{ true } < { true }'
   # scalar expressions must evaluate to a number
   - 'max(name) = "foo"'
   - 'avg("foo") = "bar"'
-  - "max(status) = ok"
-  - "max(kind) = consumer"
-  - "max(duration) < ok"
+  - 'max(status) = ok'
+  - 'max(kind) = consumer'
+  - 'max(duration) < ok'
   - 'min(1) = "foo"'
-  - "min(parent) = nil"
+  - 'min(parent) = nil'
   - 'avg(childCount) > "foo"'
   # scalar filters
-  - "avg(.field) + 1" # scalar filters must resolve to boolean
-  - "sum(3) - 2"
-  - "min(childCount) && 2"
+  - 'avg(.field) + 1'             # scalar filters must resolve to boolean
+  - 'sum(3) - 2'
+  - 'min(childCount) && 2'
   # pipelines
-  - "coalesce() | { true }" # pipelines can't start with coalesce
-  - "count() > 3 && { true }" # scalar filters have to be in pipeline
-  - "{ true } | count()" # naked scalar pipelines not allowed
-  - "{ true } | notAnAggregate() = 1"
-  - "{ true } | count = 1"
-  - "{ true } | max() = 1"
-  - "{ true } | by()"
+  - 'coalesce() | { true }'       # pipelines can't start with coalesce
+  - 'count() > 3 && { true }'     # scalar filters have to be in pipeline
+  - '{ true } | count()'          # naked scalar pipelines not allowed
+  - '{ true } | notAnAggregate() = 1'
+  - '{ true } | count = 1'
+  - '{ true } | max() = 1'
+  - '{ true } | by()'
   # pipeline expressions
-  - "({ true }) + (count()) = 1"
-  - "({ true }) && (count())"
-  - "({ true } | count()) && ({ true } | count()) = 1"
-  - "({ true }) + ({ true }) = 1"
-  - "({ true } | count()) + ({ true } | count())"
+  - '({ true }) + (count()) = 1'
+  - '({ true }) && (count())'
+  - '({ true } | count()) && ({ true } | count()) = 1'
+  - '({ true }) + ({ true }) = 1'
+  - '({ true } | count()) + ({ true } | count())'
   # todo: improve the following
-  - "(by(namespace) | count()) > 2 * 2" # scalar expressions are currently not allowed in scalar pipelines
-  - "(by(namespace) | count()) * 2 > 2"
-  - "2 < (by(namespace) | count())" # static value needs to be on the RHS to remove conflicts with scalar expressions
+  - '(by(namespace) | count()) > 2 * 2' # scalar expressions are currently not allowed in scalar pipelines
+  - '(by(namespace) | count()) * 2 > 2'
+  - '2 < (by(namespace) | count())'     # static value needs to be on the RHS to remove conflicts with scalar expressions
 
 # validate_fails parse correctly and return an error **besides unsupported** when calling .validate()
 validate_fails:
   # span expressions must evaluate to a boolean
-  - "{ status }"
-  - "{ kind }"
-  - "{ ok }"
-  - "{ 1.1 }"
-  - "{ 1h }"
+  - '{ status }'
+  - '{ kind }'
+  - '{ ok }'
+  - '{ 1.1 }'
+  - '{ 1h }'
   - '{ "foo" }'
-  - "{ 1 + 1 }"
+  - '{ 1 + 1 }'       
   # binary operators - incorrect types
   - '{ 1 + "foo" = 1 }'
-  - "{ 1 - true = 1 }"
-  - "{ 1 / ok = 1 }"
-  - "{ 1 ^ name = 1 }"
+  - '{ 1 - true = 1 }'
+  - '{ 1 / ok = 1 }'
+  - '{ 1 ^ name = 1 }'
   - '{ 1 = "foo" }'
-  - "{ 1 != true }"
-  - "{ 1 > ok }"
-  - "{ 1 = name }"
-  - "{ 1 =~ 2}"
+  - '{ 1 != true }'
+  - '{ 1 > ok }'
+  - '{ 1 = name }'
+  - '{ 1 =~ 2}'
+  - '{ 1 !~ 2}'
   - '{ 1 && "foo" }'
-  - "{ 1 || ok }"
-  - "{ true || 1.1 }"
-  - "{ status > ok }"
-  - "{ kind < consumer }"
+  - '{ 1 || ok }'
+  - '{ true || 1.1 }'
+  - '{ status > ok }'
+  - '{ kind < consumer }'
   # unary operators - incorrect types
-  - "{ -true }"
+  - '{ -true }'
   - '{ -"foo" = "bar" }'
-  - "{ -ok = status }"
+  - '{ -ok = status }'
   - '{ -name = "foo" }'
   - '{ !"foo" = "bar" }'
-  - "{ !ok = status }"
-  - "{ !consumer = kind }"
+  - '{ !ok = status }'
+  - '{ !consumer = kind }'
   - '{ !name = "foo" }'
-  - "{ !1 = 1 }"
-  - "{ !1h = 1 }"
-  - "{ !1.1 = 1.1 }"
+  - '{ !1 = 1 }'
+  - '{ !1h = 1 }'
+  - '{ !1.1 = 1.1 }'
   # scalar expressions must evaluate to a number
-  - "min(1 = 3) = 1"
+  - 'min(1 = 3) = 1'
   # scalar expressions must reference the span
-  - "sum(3) = 2"
-  - "sum(3) = min(14)"
-  - "min(2h) < max(duration)"
-  - "min(3) = max(duration)"
-  - "min(1) = max(2) + 3"
-  - "min(1.1 - 3) > 1"
-  - "max(1h + 2h) > 1"
+  - 'sum(3) = 2'
+  - 'sum(3) = min(14)'
+  - 'min(2h) < max(duration)'
+  - 'min(3) = max(duration)'
+  - 'min(1) = max(2) + 3'
+  - 'min(1.1 - 3) > 1'
+  - 'max(1h + 2h) > 1'
 
 # unsupported parse correctly and return an unsupported error when calling .validate()
 unsupported:
   # coalesce - will be valid when supported
-  - "{ true } | coalesce()"
-  - "{ true } | by(1 + .a) | coalesce()"
+  - '{ true } | coalesce()'
+  - '{ true } | by(1 + .a) | coalesce()'
   # by - will be valid when supported
-  - "{ true } | by(.a)"
-  - "{ true } | by(1 + .a)"
-  - "by(.a) | { true }"
-  - "{ true } | by(name) | count() > 2"
-  - "{ true } | by(.field) | avg(.b) = 2"
+  - '{ true } | by(.a)'
+  - '{ true } | by(1 + .a)'
+  - 'by(.a) | { true }'
+  - '{ true } | by(name) | count() > 2'
+  - '{ true } | by(.field) | avg(.b) = 2'
   # by - will *not* be valid when supported - group expressions must reference the span
-  - "{ true } | by(1)"
+  - '{ true } | by(1)'
   - '{ true } | by("foo")'
   # complex scalar filters - will be valid when supported
-  - "min(.field) < max(duration)"
-  - "sum(.field) = min(.field)"
-  - "min(.field) + max(.field) > 1"
-  - "min(.field) + max(childCount) > max(duration) - min(.field)"
-  - "min(childCount) < 2 / 6"
-  - "max(1 - (2 + .field)) < avg(3 * duration ^ 2)"
+  - 'min(.field) < max(duration)'
+  - 'sum(.field) = min(.field)'
+  - 'min(.field) + max(.field) > 1'
+  - 'min(.field) + max(childCount) > max(duration) - min(.field)'
+  - 'min(childCount) < 2 / 6'
+  - 'max(1 - (2 + .field)) < avg(3 * duration ^ 2)'
   # aggregates - will be valid when supported
-  - "min(childCount) < 2"
-  - "{ true } | max(parent.a) = 1"
-  - "{ true } | by(3 * .field - 2) | max(duration) < 1s"
-  - "{ .http.status = 200 } | max(.field) - min(.field) > 3"
+  - 'min(childCount) < 2'
+  - '{ true } | max(parent.a) = 1'
+  - '{ true } | by(3 * .field - 2) | max(duration) < 1s'
+  - '{ .http.status = 200 } | max(.field) - min(.field) > 3'
   # parent - will be valid when supported
-  - "{ parent.a != 3 }"
-  - "{ parent.resource.a && true }"
-  - "{ parent.span.a > 3 }"
-  - "{ parent.duration = 1h }"
-  - "{ parent = nil }"
-  - "{ (-(3 / 2) * .test - parent.blerg + .other)^3 = 2 }"
+  - '{ parent.a != 3 }'
+  - '{ parent.resource.a && true }'
+  - '{ parent.span.a > 3 }'
+  - '{ parent.duration = 1h }'
+  - '{ parent = nil }'
+  - '{ (-(3 / 2) * .test - parent.blerg + .other)^3 = 2 }'
   # parent - will not be valid when supported
-  - "{ parent }"
-  - "{ 1 % parent = 1 }"
-  - "{ 1 >= parent }"
-  - "{ -parent = nil }"
-  - "{ !parent = nil }"
+  - '{ parent }'
+  - '{ 1 % parent = 1 }'
+  - '{ 1 >= parent }'
+  - '{ -parent = nil }'
+  - '{ !parent = nil }'
   # nil - will be valid when supported
-  - "{ .foo = nil }"
+  - '{ .foo = nil }'
   # childCount - will be valid when supported
-  - "{ 1 = childCount }"
+  - '{ 1 = childCount }'
   # childCount - will be invalid when supported
   - '{ "foo" = childCount }'
   # spanset operations - will be valid when supported
-  - "{ true } >> { true }"
-  - "{ true } > { true }"
-  - "{ true } ~ { true }"
-  - "({ true } | count() > 1 | { false }) >> ({ true } | count() > 1 | { false })"
-  - "({ true } | count() > 1 | { false }) > ({ true } | count() > 1 | { false })"
-  - "({ true } | count() > 1 | { false }) ~ ({ true } | count() > 1 | { false })"
+  - '{ true } >> { true }'
+  - '{ true } > { true }'
+  - '{ true } ~ { true }'
+  - '({ true } | count() > 1 | { false }) >> ({ true } | count() > 1 | { false })'
+  - '({ true } | count() > 1 | { false }) > ({ true } | count() > 1 | { false })'
+  - '({ true } | count() > 1 | { false }) ~ ({ true } | count() > 1 | { false })'
   # spanset pipelines + scalar filters - will be valid when supported
-  - "{ true } | count() + count() = 1"
-  - "({ true } | count()) + ({ true } | count()) = 1"
-  - "({ true } | count()) - ({ true } | count()) <= 1"
-  - "({ true } | count()) / ({ true } | count()) > ({ true } | count()) / ({ true } | count())"
-  - "({ true } | count()) * ({ true } | count()) < ({ true } | count()) / ({ true } | count())"
-  - "({ .http.status = 200 } | count()) + ({ name = `foo` } | avg(duration)) = 2"
-  - "({ .a } | count()) > ({ .b } | count())"
+  - '{ true } | count() + count() = 1' 
+  - '({ true } | count()) + ({ true } | count()) = 1'
+  - '({ true } | count()) - ({ true } | count()) <= 1'
+  - '({ true } | count()) / ({ true } | count()) > ({ true } | count()) / ({ true } | count())'
+  - '({ true } | count()) * ({ true } | count()) < ({ true } | count()) / ({ true } | count())'
+  - '({ .http.status = 200 } | count()) + ({ name = `foo` } | avg(duration)) = 2'
+  - '({ .a } | count()) > ({ .b } | count())'
   # other scalar filters. no idea if these should be supported
-  - "3 = 2" # naked scalar filter, technically allowed
-  - "avg(.field) > 1 - 3" # scalar expressions in scalar filters are currently not allowed. possible future addition
+  - '3 = 2'                       # naked scalar filter, technically allowed
+  - 'avg(.field) > 1 - 3'         # scalar expressions in scalar filters are currently not allowed. possible future addition
 
 # parsed and the ast is dumped to stdout. this is a debugging tool
-dump:
+dump:
\ No newline at end of file