From d4490c94ef04c4c2f72edf40332517a70eda85fd Mon Sep 17 00:00:00 2001 From: Yahor Yuzefovich Date: Mon, 8 Aug 2022 19:25:13 -0700 Subject: [PATCH 1/2] colbuilder: add a microbenchmark for running many render expressions This commit adds a microbenchmark of queries with many render expressions. It'll be used in the following commit to tune when we fall back to wrapping a row-by-row processor to handle those renders. Release note: None --- pkg/sql/colexec/colbuilder/execplan_test.go | 41 +++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/pkg/sql/colexec/colbuilder/execplan_test.go b/pkg/sql/colexec/colbuilder/execplan_test.go index 28582acba521..abc7b4e49225 100644 --- a/pkg/sql/colexec/colbuilder/execplan_test.go +++ b/pkg/sql/colexec/colbuilder/execplan_test.go @@ -12,6 +12,8 @@ package colbuilder import ( "context" + "fmt" + "strings" "testing" "github.com/cockroachdb/cockroach/pkg/base" @@ -159,3 +161,42 @@ func TestNewColOperatorExpectedTypeSchema(t *testing.T) { } require.Equal(t, numRows, rowIdx) } + +// BenchmarkRenderPlanning benchmarks how long it takes to run a query with many +// render expressions inside. With a small number of rows to read, the overhead of +// allocating the initial vectors for the projection operators dominates. 
+func BenchmarkRenderPlanning(b *testing.B) { + defer leaktest.AfterTest(b)() + defer log.Scope(b).Close(b) + + ctx := context.Background() + s, db, _ := serverutils.StartServer(b, base.TestServerArgs{SQLMemoryPoolSize: 10 << 30}) + defer s.Stopper().Stop(ctx) + + jsonValue := `'{"string": "string", "int": 123, "null": null, "nested": {"string": "string", "int": 123, "null": null, "nested": {"string": "string", "int": 123, "null": null}}}'` + + sqlDB := sqlutils.MakeSQLRunner(db) + for _, numRows := range []int{1, 1 << 3, 1 << 6, 1 << 9} { + sqlDB.Exec(b, "DROP TABLE IF EXISTS bench") + sqlDB.Exec(b, "CREATE TABLE bench (id INT PRIMARY KEY, state JSONB)") + sqlDB.Exec(b, fmt.Sprintf(`INSERT INTO bench SELECT i, %s FROM generate_series(1, %d) AS g(i)`, jsonValue, numRows)) + sqlDB.Exec(b, "ANALYZE bench") + for _, numRenders := range []int{1, 1 << 4, 1 << 8, 1 << 12} { + var sb strings.Builder + sb.WriteString("SELECT ") + for i := 0; i < numRenders; i++ { + if i > 0 { + sb.WriteString(", ") + } + sb.WriteString(fmt.Sprintf("state->'nested'->>'nested' AS test%d", i+1)) + } + sb.WriteString(" FROM bench") + query := sb.String() + b.Run(fmt.Sprintf("rows=%d/renders=%d", numRows, numRenders), func(b *testing.B) { + for i := 0; i < b.N; i++ { + sqlDB.Exec(b, query) + } + }) + } + } +} From f5a28fb3689fd08465eb376db356551e8df230ba Mon Sep 17 00:00:00 2001 From: Yahor Yuzefovich Date: Mon, 8 Aug 2022 20:44:46 -0700 Subject: [PATCH 2/2] colbuilder: fall back to row-by-row processor wrapping for many renders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces a mechanism to handle render expressions by wrapping a row-by-row processor into the vectorized flow when 1. the estimated number of rows to go through the renders is relatively small 2. the number of renders is relatively high. 
The idea is that the vectorized projection operators have higher overhead when many of them are planned AND there is not enough data to amortize the overhead, so to improve the performance in those cases we'll use the row-by-row noop processor. Both of the thresholds are controlled by cluster settings and the defaults were chosen based on a representative microbenchmark. It's worth pointing out that we only have the estimated row count for the scan operators, so the change has limited applicability. ``` RenderPlanning/rows=1/renders=1-24 407µs ± 2% 408µs ± 2% ~ (p=0.684 n=10+10) RenderPlanning/rows=1/renders=8-24 516µs ± 1% 537µs ± 1% +4.05% (p=0.000 n=10+10) RenderPlanning/rows=1/renders=32-24 832µs ± 1% 811µs ± 1% -2.59% (p=0.000 n=10+10) RenderPlanning/rows=1/renders=64-24 1.22ms ± 0% 1.14ms ± 1% -6.62% (p=0.000 n=9+10) RenderPlanning/rows=1/renders=128-24 2.02ms ± 0% 1.80ms ± 1% -11.18% (p=0.000 n=8+9) RenderPlanning/rows=1/renders=512-24 7.75ms ± 1% 5.75ms ± 1% -25.77% (p=0.000 n=10+9) RenderPlanning/rows=1/renders=4096-24 160ms ± 1% 62ms ± 1% -61.51% (p=0.000 n=10+9) RenderPlanning/rows=4/renders=1-24 438µs ± 2% 438µs ± 1% ~ (p=0.853 n=10+10) RenderPlanning/rows=4/renders=8-24 603µs ± 1% 633µs ± 2% +5.06% (p=0.000 n=10+10) RenderPlanning/rows=4/renders=32-24 1.08ms ± 1% 1.08ms ± 1% ~ (p=0.105 n=10+10) RenderPlanning/rows=4/renders=64-24 1.72ms ± 0% 1.62ms ± 0% -5.83% (p=0.000 n=9+9) RenderPlanning/rows=4/renders=128-24 3.01ms ± 1% 2.75ms ± 1% -8.78% (p=0.000 n=10+10) RenderPlanning/rows=4/renders=512-24 11.6ms ± 1% 9.6ms ± 2% -17.58% (p=0.000 n=10+10) RenderPlanning/rows=4/renders=4096-24 192ms ± 2% 91ms ± 2% -52.58% (p=0.000 n=10+10) RenderPlanning/rows=16/renders=1-24 494µs ± 1% 499µs ± 1% +1.03% (p=0.006 n=10+8) RenderPlanning/rows=16/renders=8-24 855µs ± 1% 901µs ± 1% +5.37% (p=0.000 n=10+10) RenderPlanning/rows=16/renders=32-24 2.03ms ± 1% 2.04ms ± 1% ~ (p=0.190 n=10+10) RenderPlanning/rows=16/renders=64-24 3.58ms ± 1% 3.42ms ± 1% -4.56% (p=0.000 n=10+9) 
RenderPlanning/rows=16/renders=128-24 6.74ms ± 1% 6.31ms ± 1% -6.37% (p=0.000 n=10+10) RenderPlanning/rows=16/renders=512-24 26.9ms ± 1% 24.7ms ± 1% -8.24% (p=0.000 n=9+10) RenderPlanning/rows=16/renders=4096-24 329ms ± 2% 218ms ± 2% -33.66% (p=0.000 n=10+10) RenderPlanning/rows=64/renders=1-24 666µs ± 1% 659µs ± 2% -1.07% (p=0.007 n=10+10) RenderPlanning/rows=64/renders=8-24 1.79ms ± 1% 1.84ms ± 1% +3.01% (p=0.000 n=10+10) RenderPlanning/rows=64/renders=32-24 5.53ms ± 1% 5.79ms ± 2% +4.74% (p=0.000 n=10+10) RenderPlanning/rows=64/renders=64-24 10.8ms ± 1% 11.0ms ± 1% +1.87% (p=0.000 n=10+9) RenderPlanning/rows=64/renders=128-24 21.2ms ± 1% 21.7ms ± 1% +2.71% (p=0.000 n=10+10) RenderPlanning/rows=64/renders=512-24 83.6ms ± 0% 84.9ms ± 0% +1.47% (p=0.000 n=10+7) RenderPlanning/rows=64/renders=4096-24 824ms ± 1% 751ms ± 2% -8.88% (p=0.000 n=10+10) RenderPlanning/rows=128/renders=1-24 853µs ± 1% 851µs ± 1% ~ (p=0.481 n=10+10) RenderPlanning/rows=128/renders=8-24 2.98ms ± 1% 3.11ms ± 1% +4.32% (p=0.000 n=10+10) RenderPlanning/rows=128/renders=32-24 10.4ms ± 1% 10.9ms ± 1% +5.44% (p=0.000 n=10+10) RenderPlanning/rows=128/renders=64-24 20.1ms ± 1% 21.3ms ± 1% +5.99% (p=0.000 n=10+10) RenderPlanning/rows=128/renders=128-24 39.7ms ± 1% 42.1ms ± 2% +5.98% (p=0.000 n=10+10) RenderPlanning/rows=128/renders=512-24 160ms ± 1% 168ms ± 2% +5.13% (p=0.000 n=9+10) RenderPlanning/rows=128/renders=4096-24 1.44s ± 1% 1.48s ± 2% +3.15% (p=0.000 n=9+10) RenderPlanning/rows=256/renders=1-24 1.22ms ± 1% 1.21ms ± 1% -1.01% (p=0.000 n=10+10) RenderPlanning/rows=256/renders=8-24 5.22ms ± 0% 5.19ms ± 1% -0.54% (p=0.011 n=8+9) RenderPlanning/rows=256/renders=32-24 19.9ms ± 1% 20.0ms ± 1% ~ (p=0.182 n=9+10) RenderPlanning/rows=256/renders=64-24 39.0ms ± 0% 38.9ms ± 0% -0.33% (p=0.023 n=10+10) RenderPlanning/rows=256/renders=128-24 76.8ms ± 1% 76.7ms ± 1% ~ (p=0.739 n=10+10) RenderPlanning/rows=256/renders=512-24 316ms ± 1% 319ms ± 1% +1.15% (p=0.001 n=9+10) 
RenderPlanning/rows=256/renders=4096-24 2.75s ± 1% 2.73s ± 1% -0.64% (p=0.002 n=8+9) ``` Release note: None --- pkg/sql/colexec/colbuilder/BUILD.bazel | 1 + pkg/sql/colexec/colbuilder/execplan.go | 75 +++++++++++++++++- .../execbuilder/testdata/vectorize_wrapping | 78 +++++++++++++++++++ .../execbuilder/tests/local/generated_test.go | 7 ++ 4 files changed, 157 insertions(+), 4 deletions(-) create mode 100644 pkg/sql/opt/exec/execbuilder/testdata/vectorize_wrapping diff --git a/pkg/sql/colexec/colbuilder/BUILD.bazel b/pkg/sql/colexec/colbuilder/BUILD.bazel index 3206277b95a4..3e915e25dbf5 100644 --- a/pkg/sql/colexec/colbuilder/BUILD.bazel +++ b/pkg/sql/colexec/colbuilder/BUILD.bazel @@ -10,6 +10,7 @@ go_library( "//pkg/col/coldata", "//pkg/col/coldataext", "//pkg/col/typeconv", + "//pkg/settings", "//pkg/sql/catalog/descpb", "//pkg/sql/colconv", "//pkg/sql/colexec", diff --git a/pkg/sql/colexec/colbuilder/execplan.go b/pkg/sql/colexec/colbuilder/execplan.go index 23233989c752..5f44473dba6a 100644 --- a/pkg/sql/colexec/colbuilder/execplan.go +++ b/pkg/sql/colexec/colbuilder/execplan.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/col/coldata" "github.com/cockroachdb/cockroach/pkg/col/coldataext" "github.com/cockroachdb/cockroach/pkg/col/typeconv" + "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/colconv" "github.com/cockroachdb/cockroach/pkg/sql/colexec" @@ -1495,7 +1496,7 @@ func NewColOperator( Op: result.Root, ColumnTypes: result.ColumnTypes, } - err = ppr.planPostProcessSpec(ctx, flowCtx, args, post, factory, &r.Releasables) + err = ppr.planPostProcessSpec(ctx, flowCtx, args, post, factory, &r.Releasables, args.Spec.EstimatedRowCount) if err != nil { err = result.wrapPostProcessSpec(ctx, flowCtx, args, post, spec.ProcessorID, factory, err) } else { @@ -1647,6 +1648,46 @@ func (r opResult) wrapPostProcessSpec( ) } +// renderExprCountVisitor 
counts how many projection operators need to be +// planned across render expressions. +type renderExprCountVisitor struct { + renderCount int64 +} + +var _ tree.Visitor = &renderExprCountVisitor{} + +func (r *renderExprCountVisitor) VisitPre(expr tree.Expr) (recurse bool, newExpr tree.Expr) { + if _, ok := expr.(*tree.IndexedVar); ok { + // IndexedVars don't get a projection operator (we just refer to the + // vector by index), so they don't contribute to the render count. + return false, expr + } + r.renderCount++ + return true, expr +} + +func (r *renderExprCountVisitor) VisitPost(expr tree.Expr) tree.Expr { + return expr +} + +var renderWrappingRowCountThreshold = settings.RegisterIntSetting( + settings.TenantWritable, + "sql.distsql.vectorize_render_wrapping.max_row_count", + "determines the maximum number of estimated rows that flow through the render "+ + "expressions up to which we handle those renders by wrapping a row-by-row processor", + 128, + settings.NonNegativeInt, +) + +var renderWrappingRenderCountThreshold = settings.RegisterIntSetting( + settings.TenantWritable, + "sql.distsql.vectorize_render_wrapping.min_render_count", + "determines the minimum number of render expressions for which we fall "+ + "back to handling renders by wrapping a row-by-row processor", + 16, + settings.NonNegativeInt, +) + // planPostProcessSpec plans the post processing stage specified in post on top // of r.Op. 
func (r *postProcessResult) planPostProcessSpec( @@ -1656,16 +1697,42 @@ func (r *postProcessResult) planPostProcessSpec( post *execinfrapb.PostProcessSpec, factory coldata.ColumnFactory, releasables *[]execreleasable.Releasable, + estimatedRowCount uint64, ) error { if post.Projection { r.Op, r.ColumnTypes = addProjection(r.Op, r.ColumnTypes, post.OutputColumns) } else if post.RenderExprs != nil { - var renderedCols []uint32 - for _, renderExpr := range post.RenderExprs { - expr, err := args.ExprHelper.ProcessExpr(renderExpr, flowCtx.EvalCtx, r.ColumnTypes) + // Deserialize expressions upfront. + exprs := make([]tree.TypedExpr, len(post.RenderExprs)) + var err error + for i := range exprs { + exprs[i], err = args.ExprHelper.ProcessExpr(post.RenderExprs[i], flowCtx.EvalCtx, r.ColumnTypes) if err != nil { return err } + } + // If we have an estimated row count and it doesn't exceed the wrapping + // row count threshold, we might need to fall back to wrapping a + // row-by-row processor to handle the render expressions (for better + // performance). + if estimatedRowCount != 0 && + estimatedRowCount <= uint64(renderWrappingRowCountThreshold.Get(&flowCtx.Cfg.Settings.SV)) { + renderCountThreshold := renderWrappingRenderCountThreshold.Get(&flowCtx.Cfg.Settings.SV) + // Walk over all expressions and estimate how many projection + // operators will need to be created. 
+ var v renderExprCountVisitor + for _, expr := range exprs { + tree.WalkExpr(&v, expr) + if v.renderCount >= renderCountThreshold { + return errors.Newf( + "falling back to wrapping a row-by-row processor for at least "+ + "%d renders, estimated row count = %d", v.renderCount, estimatedRowCount, + ) + } + } + } + var renderedCols []uint32 + for _, expr := range exprs { var outputIdx int r.Op, outputIdx, r.ColumnTypes, err = planProjectionOperators( ctx, flowCtx.EvalCtx, expr, r.ColumnTypes, r.Op, args.StreamingMemAccount, factory, releasables, diff --git a/pkg/sql/opt/exec/execbuilder/testdata/vectorize_wrapping b/pkg/sql/opt/exec/execbuilder/testdata/vectorize_wrapping new file mode 100644 index 000000000000..29db1e965821 --- /dev/null +++ b/pkg/sql/opt/exec/execbuilder/testdata/vectorize_wrapping @@ -0,0 +1,78 @@ +# LogicTest: local + +statement ok +CREATE TABLE t85632 (k INT PRIMARY KEY); + +statement ok +ALTER TABLE t85632 INJECT STATISTICS '[ + { + "avg_size": 1, + "columns": [ + "k" + ], + "created_at": "2022-08-09 09:00:00.00000", + "distinct_count": 1000, + "name": "__auto__", + "null_count": 0, + "row_count": 1000 + } + ]' + +# Use experimental_always vectorize mode so that we error out when trying to +# wrap a row-by-row processor. +statement ok +SET vectorize=experimental_always; + +# The estimated row count exceeds the max wrapping row count and the number of +# renders is smaller than the min render count, so we use native projection +# operators. +query T +EXPLAIN (VEC) SELECT k + k + k + k FROM t85632 +---- +│ +└ Node 1 + └ *colexecproj.projPlusInt64Int64Op + └ *colexecproj.projPlusInt64Int64Op + └ *colexecproj.projPlusInt64Int64Op + └ *colfetcher.ColBatchScan + +statement ok +SET CLUSTER SETTING sql.distsql.vectorize_render_wrapping.min_render_count = 3; + +# The estimated row count still exceeds the max wrapping row count. 
+query T +EXPLAIN (VEC) SELECT k + k + k + k FROM t85632 +---- +│ +└ Node 1 + └ *colexecproj.projPlusInt64Int64Op + └ *colexecproj.projPlusInt64Int64Op + └ *colexecproj.projPlusInt64Int64Op + └ *colfetcher.ColBatchScan + +statement ok +SET CLUSTER SETTING sql.distsql.vectorize_render_wrapping.max_row_count = 1000; + +# Now both wrapping conditions are satisfied. +query error falling back to wrapping a row-by-row processor +EXPLAIN (VEC) SELECT k + k + k + k FROM t85632 + +statement ok +RESET CLUSTER SETTING sql.distsql.vectorize_render_wrapping.min_render_count; + +# The render count isn't sufficient for wrapping to kick in. +query T +EXPLAIN (VEC) SELECT k + k + k + k FROM t85632 +---- +│ +└ Node 1 + └ *colexecproj.projPlusInt64Int64Op + └ *colexecproj.projPlusInt64Int64Op + └ *colexecproj.projPlusInt64Int64Op + └ *colfetcher.ColBatchScan + +statement ok +RESET CLUSTER SETTING sql.distsql.vectorize_render_wrapping.max_row_count; + +statement ok +RESET vectorize diff --git a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go index 159af9953155..26aeca7622d4 100644 --- a/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go +++ b/pkg/sql/opt/exec/execbuilder/tests/local/generated_test.go @@ -578,6 +578,13 @@ func TestExecBuild_vectorize_overloads( runExecBuildLogicTest(t, "vectorize_overloads") } +func TestExecBuild_vectorize_wrapping( + t *testing.T, +) { + defer leaktest.AfterTest(t)() + runExecBuildLogicTest(t, "vectorize_wrapping") +} + func TestExecBuild_virtual( t *testing.T, ) {