Skip to content

Commit

Permalink
sql: add a rule to push a distinct modifier into a scalargroupby
Browse files Browse the repository at this point in the history
Previously, the optimizer could not take advantage of an index on
a variable with a command like the following:

SELECT COUNT(DISTINCT y) FROM xy;

To address this, PushAggDistinctIntoScalarGroupBy pushes the
distinct operation out of the aggregate function and into the
input of the ScalarGroupBy as a DistinctOn operator.

Fixes cockroachdb#46899

Release note: None
  • Loading branch information
DrewKimball committed Apr 17, 2020
1 parent a63a670 commit fb507a4
Show file tree
Hide file tree
Showing 9 changed files with 389 additions and 84 deletions.
32 changes: 17 additions & 15 deletions pkg/sql/opt/exec/execbuilder/testdata/aggregate
Original file line number Diff line number Diff line change
Expand Up @@ -211,21 +211,23 @@ group · · (count, count, sum, sum, min, min)
query TTTTT
EXPLAIN (VERBOSE) SELECT count(DISTINCT a.*) FROM kv a, kv b
----
· distributed false · ·
· vectorized false · ·
group · · (count) ·
│ aggregate 0 count(DISTINCT column9) · ·
│ scalar · · ·
└── render · · (column9) ·
│ render 0 ((k, v, w, s) AS k, v, w, s) · ·
└── cross-join · · (k, v, w, s) ·
│ type cross · ·
├── scan · · (k, v, w, s) ·
│ table kv@primary · ·
│ spans FULL SCAN · ·
└── scan · · () ·
· table kv@primary · ·
· spans FULL SCAN · ·
· distributed false · ·
· vectorized false · ·
group · · (count) ·
│ aggregate 0 count(column9) · ·
│ scalar · · ·
└── distinct · · (column9) ·
│ distinct on column9 · ·
└── render · · (column9) ·
│ render 0 ((k, v, w, s) AS k, v, w, s) · ·
└── cross-join · · (k, v, w, s) ·
│ type cross · ·
├── scan · · (k, v, w, s) ·
│ table kv@primary · ·
│ spans FULL SCAN · ·
└── scan · · () ·
· table kv@primary · ·
· spans FULL SCAN · ·

query TTT
SELECT tree, field, description FROM [
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/opt/exec/execbuilder/testdata/distsql_agg
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ https://cockroachdb.github.io/distsqlplan/decode.html#eJyslEFvmzAYhu_7FdZ3ajUjMJ
query T
SELECT url FROM [EXPLAIN (DISTSQL) SELECT SUM (DISTINCT A) FROM data]
----
https://cockroachdb.github.io/distsqlplan/decode.html#eJyslFGLm04Uxd__n2K4T__ABDNqslmfsuymIGSTbUyhsPgwdS5WMI6dGaEl5LsXtSRNyI4S--g4557z88g9gP6RQwDRcrV83pFK5eTTdvNK3pdf31ZP4Zr8_xJGu-jzakT-XNHVvjkL1887wkftdcENj4FCIQWu-R41BO_AgIILFDyg4AOFKcQUSiUT1Fqq-sqhEYTiJwQTCllRVqY-jikkUiEEBzCZyREC2PFvOW6RC1TOBCgINDzLG5tSZXuufi3qDEAhKnmhAzJ2GOGFIIxI8x0VUNhUJiALBvGRgqzM2UsbniIE7Ej753nJtMmKxDjTyzCW-e6H889jq0IqgQrFxdT4eCPBU5oqTLmRymFXXyT68nquaMFGH0byLiKx_hWwPhU4bOy4g0roSHQqYXZnCW5_YrcXsTt2vEHEHYlOxA93Env9ib1exN7Y8QcRdyQ6Ec_vJPb7E_u9iP2xMx1E3JHoRPz4D1bLjflb1KUsNF6tmNuTJ_XqQZFiu6e0rFSCb0omjU37uGl0zYFAbdq3rH0Ii_ZVHfBvMbOK3Qsxuxa7ducOa8-q9u1if0juqVU8szvPhjg_WMVzu_N8iPOjvatJx29i_8muvePjf78DAAD__wbY234=
https://cockroachdb.github.io/distsqlplan/decode.html#eJyslEFvmzAYhu_7FdZ3ajRHxECahlOqNpMipUkXMmlSxcHDnxgSwcw20qYo_30CpnRUi6EjR4xfnvfhs3wE_SODAMLlevmwJ6XKyKfd9om8LL8-r-9XG3LzuAr34ef1iPzZostDvbbaPOwJHzXbBTc8Agq5FLjhB9QQvAADCi5Q8ICCDxSmEFEolIxRa6mqLcc6sBI_IZhQSPOiNNVyRCGWCiE4gklNhhDAnn_LcIdcoHImQEGg4WlWYwqVHrj6tag6AIWw4LkOyNhhhOeCMCLNd1RAYVuagCwYRCcKsjSvLG14ghCwE-3f5zHVJs1j40zbZRaV9FYJVCisNPci7RUim--8JXyE6GSrxCb_2cl7zx-4TxKFCTdSOYy1geGXp5sFG13k-C0O6z951mfyDhs77qDZdzQ6_-jbq8ze7e_v9vJ3x443yL-j0dl_dhV_r7-_18vfGzv-IP-ORmf_u6v4-_39_V7-_tiZDvLvaHT2n1_97vsHbYe6kLnGFuvSlyfVxYgiweYi1bJUMT4rGdeY5nFb5-oFgdo0b1nzsMqbV1XBv8PMGnZbYfY27FrDnp3s2Wu79rRvTU_t4ekQ6VtreGYnz4aQ76zhuZ08H0JmHWes65C975RFpw-_AwAA___7miUG

query T
SELECT url FROM [EXPLAIN (DISTSQL) SELECT SUM (DISTINCT A), SUM (DISTINCT B) FROM data]
Expand Down
53 changes: 30 additions & 23 deletions pkg/sql/opt/memo/testdata/stats_quality/tpcc
Original file line number Diff line number Diff line change
Expand Up @@ -544,30 +544,35 @@ scalar-group-by
├── stats: [rows=1, distinct(28)=1, null(28)=0]
├── key: ()
├── fd: ()-->(28)
├── inner-join (lookup stock)
│ ├── save-table-name: stock_level_02_lookup_join_2
│ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null) s_i_id:11(int!null) s_w_id:12(int!null) s_quantity:13(int!null)
│ ├── key columns: [3 5] = [12 11]
│ ├── lookup columns are key
│ ├── stats: [rows=216.137889, distinct(1)=19.9995949, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(5)=185.570315, null(5)=0, distinct(11)=185.570315, null(11)=0, distinct(12)=1, null(12)=0, distinct(13)=30.3089364, null(13)=0]
│ ├── fd: ()-->(2,3,12), (11)-->(13), (5)==(11), (11)==(5), (3)==(12), (12)==(3)
│ ├── scan order_line
│ │ ├── save-table-name: stock_level_02_scan_3
│ │ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null)
│ │ ├── constraint: /3/2/-1/4: [/1/1/999 - /1/1/980]
│ │ ├── stats: [rows=185.737555, distinct(1)=20, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(5)=185.570315, null(5)=0]
│ │ │ histogram(3)= 0 185.74
│ │ │ <---- 1 --
│ │ └── fd: ()-->(2,3)
│ └── filters
│ ├── s_w_id:12 = 1 [type=bool, outer=(12), constraints=(/12: [/1 - /1]; tight), fd=()-->(12)]
│ └── s_quantity:13 < 15 [type=bool, outer=(13), constraints=(/13: (/NULL - /14]; tight)]
├── distinct-on
│ ├── save-table-name: stock_level_02_distinct_on_2
│ ├── columns: s_i_id:11(int!null)
│ ├── grouping columns: s_i_id:11(int!null)
│ ├── stats: [rows=185.570315, distinct(11)=185.570315, null(11)=0]
│ ├── key: (11)
│ └── inner-join (lookup stock)
│ ├── save-table-name: stock_level_02_lookup_join_3
│ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null) s_i_id:11(int!null) s_w_id:12(int!null) s_quantity:13(int!null)
│ ├── key columns: [3 5] = [12 11]
│ ├── lookup columns are key
│ ├── stats: [rows=216.137889, distinct(1)=19.9995949, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(5)=185.570315, null(5)=0, distinct(11)=185.570315, null(11)=0, distinct(12)=1, null(12)=0, distinct(13)=30.3089364, null(13)=0]
│ ├── fd: ()-->(2,3,12), (11)-->(13), (5)==(11), (11)==(5), (3)==(12), (12)==(3)
│ ├── scan order_line
│ │ ├── save-table-name: stock_level_02_scan_4
│ │ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null)
│ │ ├── constraint: /3/2/-1/4: [/1/1/999 - /1/1/980]
│ │ ├── stats: [rows=185.737555, distinct(1)=20, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(5)=185.570315, null(5)=0]
│ │ │ histogram(3)= 0 185.74
│ │ │ <---- 1 --
│ │ └── fd: ()-->(2,3)
│ └── filters
│ ├── s_w_id:12 = 1 [type=bool, outer=(12), constraints=(/12: [/1 - /1]; tight), fd=()-->(12)]
│ └── s_quantity:13 < 15 [type=bool, outer=(13), constraints=(/13: (/NULL - /14]; tight)]
└── aggregations
└── agg-distinct [as=count:28, type=int, outer=(11)]
└── count [type=int]
└── s_i_id:11 [type=int]
└── count [as=count:28, type=int, outer=(11)]
└── s_i_id:11 [type=int]

stats table=stock_level_02_scan_3
stats table=stock_level_02_scan_4
----
column_names row_count distinct_count null_count
{ol_d_id} 193 1 0
Expand All @@ -581,7 +586,7 @@ column_names row_count_est row_count_err distinct_count_est distinct_count_e
{ol_o_id} 186.00 1.04 20.00 1.00 0.00 1.00
{ol_w_id} 186.00 1.04 1.00 1.00 0.00 1.00

stats table=stock_level_02_lookup_join_2
stats table=stock_level_02_lookup_join_3
----
column_names row_count distinct_count null_count
{ol_d_id} 15 1 0
Expand All @@ -601,6 +606,8 @@ column_names row_count_est row_count_err distinct_count_est distinct_count_e
{s_quantity} 216.00 14.40 <== 30.00 6.00 <== 0.00 1.00
{s_w_id} 216.00 14.40 <== 1.00 1.00 0.00 1.00

TODO(radu): add stock_level_02_distinct_on_2.

stats table=stock_level_02_scalar_group_by_1
----
column_names row_count distinct_count null_count
Expand Down
11 changes: 11 additions & 0 deletions pkg/sql/opt/norm/custom_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,11 @@ func (c *CustomFuncs) GroupingCols(grouping *memo.GroupingPrivate) opt.ColSet {
return grouping.GroupingCols
}

// ExtractAggInputColumns returns the set of columns the aggregate depends on.
// It is a thin wrapper over memo.ExtractAggInputColumns, exposed here so that
// optgen rules (e.g. PushAggDistinctIntoScalarGroupBy) can call it.
func (c *CustomFuncs) ExtractAggInputColumns(e opt.ScalarExpr) opt.ColSet {
	return memo.ExtractAggInputColumns(e)
}

// IsUnorderedGrouping returns true if the given grouping ordering is not
// specified.
func (c *CustomFuncs) IsUnorderedGrouping(grouping *memo.GroupingPrivate) bool {
Expand Down Expand Up @@ -1906,6 +1911,12 @@ func (c *CustomFuncs) MakeOrderedGrouping(
return &memo.GroupingPrivate{GroupingCols: groupingCols, Ordering: ordering}
}

// MakeUnorderedGrouping constructs a new GroupingPrivate using the given
// grouping columns, but with no ordering on the groups.
//
// The receiver is a pointer for consistency with the other CustomFuncs
// methods (e.g. MakeOrderedGrouping, ExtractAggInputColumns); a value
// receiver would needlessly copy the struct and split the method set.
func (c *CustomFuncs) MakeUnorderedGrouping(groupingCols opt.ColSet) *memo.GroupingPrivate {
	return &memo.GroupingPrivate{GroupingCols: groupingCols}
}

// IsLimited indicates whether a limit was pushed under the subquery
// already. See e.g. the rule IntroduceExistsLimit.
func (c *CustomFuncs) IsLimited(sub *memo.SubqueryPrivate) bool {
Expand Down
24 changes: 24 additions & 0 deletions pkg/sql/opt/norm/rules/groupby.opt
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,27 @@
(GroupingCols $groupingPrivate)
$aggregations
)

# PushAggDistinctIntoScalarGroupBy pushes an aggregate function DISTINCT
# modifier into the input of the ScalarGroupBy operator. This allows the
# optimizer to take advantage of an index on the column(s) subject to the
# DISTINCT operation. PushAggDistinctIntoScalarGroupBy can match any single
# aggregate function, including those that have multiple input arguments.
#
# The DISTINCT is implemented as a DistinctOn whose grouping columns are the
# aggregate's input columns; the aggregate itself is then rewritten without
# its AggDistinct wrapper, since its input rows are now already distinct.
# The rule only matches when the ScalarGroupBy has exactly one aggregation
# item, because a DistinctOn over one aggregate's inputs could change the
# row multiset seen by a second aggregate.
[PushAggDistinctIntoScalarGroupBy, Normalize]
(ScalarGroupBy
    $input:*
    $aggregations:[
        $item:(AggregationsItem (AggDistinct $agg:*) $aggColID:*)
    ]
    $groupingPrivate:*
)
=>
(ScalarGroupBy
    (DistinctOn
        $input
        []
        (MakeUnorderedGrouping (ExtractAggInputColumns $agg))
    )
    [ (AggregationsItem $agg $aggColID) ]
    $groupingPrivate
)
Loading

0 comments on commit fb507a4

Please sign in to comment.