Skip to content

Commit

Permalink
sql: add a rule to push a distinct modifier into a scalargroupby
Browse files Browse the repository at this point in the history
Previously, the optimizer could not take advantage of an index on
a variable with a command like the following:

SELECT COUNT(DISTINCT y) FROM xy;

To address this, PushAggDistinctIntoScalarGroupBy pushes the
distinct operation out of the aggregate function and into the
input of the ScalarGroupBy as a DistinctOn operator.

Fixes cockroachdb#46899

Release note: None
  • Loading branch information
DrewKimball committed Apr 17, 2020
1 parent a63a670 commit fb507a4
Show file tree
Hide file tree
Showing 9 changed files with 389 additions and 84 deletions.
32 changes: 17 additions & 15 deletions pkg/sql/opt/exec/execbuilder/testdata/aggregate
Original file line number Diff line number Diff line change
Expand Up @@ -211,21 +211,23 @@ group · · (count, count, sum, sum, min, min)
query TTTTT
EXPLAIN (VERBOSE) SELECT count(DISTINCT a.*) FROM kv a, kv b
----
· distributed false · ·
· vectorized false · ·
group · · (count) ·
│ aggregate 0 count(DISTINCT column9) · ·
│ scalar · · ·
└── render · · (column9) ·
│ render 0 ((k, v, w, s) AS k, v, w, s) · ·
└── cross-join · · (k, v, w, s) ·
│ type cross · ·
├── scan · · (k, v, w, s) ·
│ table kv@primary · ·
│ spans FULL SCAN · ·
└── scan · · () ·
· table kv@primary · ·
· spans FULL SCAN · ·
· distributed false · ·
· vectorized false · ·
group · · (count) ·
│ aggregate 0 count(column9) · ·
│ scalar · · ·
└── distinct · · (column9) ·
│ distinct on column9 · ·
└── render · · (column9) ·
│ render 0 ((k, v, w, s) AS k, v, w, s) · ·
└── cross-join · · (k, v, w, s) ·
│ type cross · ·
├── scan · · (k, v, w, s) ·
│ table kv@primary · ·
│ spans FULL SCAN · ·
└── scan · · () ·
· table kv@primary · ·
· spans FULL SCAN · ·

query TTT
SELECT tree, field, description FROM [
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/opt/exec/execbuilder/testdata/distsql_agg
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ https://cockroachdb.github.io/distsqlplan/decode.html#eJyslEFvmzAYhu_7FdZ3ajUjMJ
query T
SELECT url FROM [EXPLAIN (DISTSQL) SELECT SUM (DISTINCT A) FROM data]
----
https://cockroachdb.github.io/distsqlplan/decode.html#eJyslFGLm04Uxd__n2K4T__ABDNqslmfsuymIGSTbUyhsPgwdS5WMI6dGaEl5LsXtSRNyI4S--g4557z88g9gP6RQwDRcrV83pFK5eTTdvNK3pdf31ZP4Zr8_xJGu-jzakT-XNHVvjkL1887wkftdcENj4FCIQWu-R41BO_AgIILFDyg4AOFKcQUSiUT1Fqq-sqhEYTiJwQTCllRVqY-jikkUiEEBzCZyREC2PFvOW6RC1TOBCgINDzLG5tSZXuufi3qDEAhKnmhAzJ2GOGFIIxI8x0VUNhUJiALBvGRgqzM2UsbniIE7Ej753nJtMmKxDjTyzCW-e6H889jq0IqgQrFxdT4eCPBU5oqTLmRymFXXyT68nquaMFGH0byLiKx_hWwPhU4bOy4g0roSHQqYXZnCW5_YrcXsTt2vEHEHYlOxA93Env9ib1exN7Y8QcRdyQ6Ec_vJPb7E_u9iP2xMx1E3JHoRPz4D1bLjflb1KUsNF6tmNuTJ_XqQZFiu6e0rFSCb0omjU37uGl0zYFAbdq3rH0Ii_ZVHfBvMbOK3Qsxuxa7ducOa8-q9u1if0juqVU8szvPhjg_WMVzu_N8iPOjvatJx29i_8muvePjf78DAAD__wbY234=
https://cockroachdb.github.io/distsqlplan/decode.html#eJyslEFvmzAYhu_7FdZ3ajRHxECahlOqNpMipUkXMmlSxcHDnxgSwcw20qYo_30CpnRUi6EjR4xfnvfhs3wE_SODAMLlevmwJ6XKyKfd9om8LL8-r-9XG3LzuAr34ef1iPzZostDvbbaPOwJHzXbBTc8Agq5FLjhB9QQvAADCi5Q8ICCDxSmEFEolIxRa6mqLcc6sBI_IZhQSPOiNNVyRCGWCiE4gklNhhDAnn_LcIdcoHImQEGg4WlWYwqVHrj6tag6AIWw4LkOyNhhhOeCMCLNd1RAYVuagCwYRCcKsjSvLG14ghCwE-3f5zHVJs1j40zbZRaV9FYJVCisNPci7RUim--8JXyE6GSrxCb_2cl7zx-4TxKFCTdSOYy1geGXp5sFG13k-C0O6z951mfyDhs77qDZdzQ6_-jbq8ze7e_v9vJ3x443yL-j0dl_dhV_r7-_18vfGzv-IP-ORmf_u6v4-_39_V7-_tiZDvLvaHT2n1_97vsHbYe6kLnGFuvSlyfVxYgiweYi1bJUMT4rGdeY5nFb5-oFgdo0b1nzsMqbV1XBv8PMGnZbYfY27FrDnp3s2Wu79rRvTU_t4ekQ6VtreGYnz4aQ76zhuZ08H0JmHWes65C975RFpw-_AwAA___7miUG

query T
SELECT url FROM [EXPLAIN (DISTSQL) SELECT SUM (DISTINCT A), SUM (DISTINCT B) FROM data]
Expand Down
53 changes: 30 additions & 23 deletions pkg/sql/opt/memo/testdata/stats_quality/tpcc
Original file line number Diff line number Diff line change
Expand Up @@ -544,30 +544,35 @@ scalar-group-by
├── stats: [rows=1, distinct(28)=1, null(28)=0]
├── key: ()
├── fd: ()-->(28)
├── inner-join (lookup stock)
│ ├── save-table-name: stock_level_02_lookup_join_2
│ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null) s_i_id:11(int!null) s_w_id:12(int!null) s_quantity:13(int!null)
│ ├── key columns: [3 5] = [12 11]
│ ├── lookup columns are key
│ ├── stats: [rows=216.137889, distinct(1)=19.9995949, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(5)=185.570315, null(5)=0, distinct(11)=185.570315, null(11)=0, distinct(12)=1, null(12)=0, distinct(13)=30.3089364, null(13)=0]
│ ├── fd: ()-->(2,3,12), (11)-->(13), (5)==(11), (11)==(5), (3)==(12), (12)==(3)
│ ├── scan order_line
│ │ ├── save-table-name: stock_level_02_scan_3
│ │ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null)
│ │ ├── constraint: /3/2/-1/4: [/1/1/999 - /1/1/980]
│ │ ├── stats: [rows=185.737555, distinct(1)=20, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(5)=185.570315, null(5)=0]
│ │ │ histogram(3)= 0 185.74
│ │ │ <---- 1 --
│ │ └── fd: ()-->(2,3)
│ └── filters
│ ├── s_w_id:12 = 1 [type=bool, outer=(12), constraints=(/12: [/1 - /1]; tight), fd=()-->(12)]
│ └── s_quantity:13 < 15 [type=bool, outer=(13), constraints=(/13: (/NULL - /14]; tight)]
├── distinct-on
│ ├── save-table-name: stock_level_02_distinct_on_2
│ ├── columns: s_i_id:11(int!null)
│ ├── grouping columns: s_i_id:11(int!null)
│ ├── stats: [rows=185.570315, distinct(11)=185.570315, null(11)=0]
│ ├── key: (11)
│ └── inner-join (lookup stock)
│ ├── save-table-name: stock_level_02_lookup_join_3
│ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null) s_i_id:11(int!null) s_w_id:12(int!null) s_quantity:13(int!null)
│ ├── key columns: [3 5] = [12 11]
│ ├── lookup columns are key
│ ├── stats: [rows=216.137889, distinct(1)=19.9995949, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(5)=185.570315, null(5)=0, distinct(11)=185.570315, null(11)=0, distinct(12)=1, null(12)=0, distinct(13)=30.3089364, null(13)=0]
│ ├── fd: ()-->(2,3,12), (11)-->(13), (5)==(11), (11)==(5), (3)==(12), (12)==(3)
│ ├── scan order_line
│ │ ├── save-table-name: stock_level_02_scan_4
│ │ ├── columns: ol_o_id:1(int!null) ol_d_id:2(int!null) ol_w_id:3(int!null) ol_i_id:5(int!null)
│ │ ├── constraint: /3/2/-1/4: [/1/1/999 - /1/1/980]
│ │ ├── stats: [rows=185.737555, distinct(1)=20, null(1)=0, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(5)=185.570315, null(5)=0]
│ │ │ histogram(3)= 0 185.74
│ │ │ <---- 1 --
│ │ └── fd: ()-->(2,3)
│ └── filters
│ ├── s_w_id:12 = 1 [type=bool, outer=(12), constraints=(/12: [/1 - /1]; tight), fd=()-->(12)]
│ └── s_quantity:13 < 15 [type=bool, outer=(13), constraints=(/13: (/NULL - /14]; tight)]
└── aggregations
└── agg-distinct [as=count:28, type=int, outer=(11)]
└── count [type=int]
└── s_i_id:11 [type=int]
└── count [as=count:28, type=int, outer=(11)]
└── s_i_id:11 [type=int]

stats table=stock_level_02_scan_3
stats table=stock_level_02_scan_4
----
column_names row_count distinct_count null_count
{ol_d_id} 193 1 0
Expand All @@ -581,7 +586,7 @@ column_names row_count_est row_count_err distinct_count_est distinct_count_e
{ol_o_id} 186.00 1.04 20.00 1.00 0.00 1.00
{ol_w_id} 186.00 1.04 1.00 1.00 0.00 1.00

stats table=stock_level_02_lookup_join_2
stats table=stock_level_02_lookup_join_3
----
column_names row_count distinct_count null_count
{ol_d_id} 15 1 0
Expand All @@ -601,6 +606,8 @@ column_names row_count_est row_count_err distinct_count_est distinct_count_e
{s_quantity} 216.00 14.40 <== 30.00 6.00 <== 0.00 1.00
{s_w_id} 216.00 14.40 <== 1.00 1.00 0.00 1.00

TODO(radu): add stock_level_02_distinct_on_2.

stats table=stock_level_02_scalar_group_by_1
----
column_names row_count distinct_count null_count
Expand Down
11 changes: 11 additions & 0 deletions pkg/sql/opt/norm/custom_funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,11 @@ func (c *CustomFuncs) GroupingCols(grouping *memo.GroupingPrivate) opt.ColSet {
return grouping.GroupingCols
}

// ExtractAggInputColumns returns the set of columns the aggregate depends on.
// It is a thin wrapper over memo.ExtractAggInputColumns, exposed here so that
// optgen rules (e.g. PushAggDistinctIntoScalarGroupBy) can call it.
func (c *CustomFuncs) ExtractAggInputColumns(e opt.ScalarExpr) opt.ColSet {
	return memo.ExtractAggInputColumns(e)
}

// IsUnorderedGrouping returns true if the given grouping ordering is not
// specified.
func (c *CustomFuncs) IsUnorderedGrouping(grouping *memo.GroupingPrivate) bool {
Expand Down Expand Up @@ -1906,6 +1911,12 @@ func (c *CustomFuncs) MakeOrderedGrouping(
return &memo.GroupingPrivate{GroupingCols: groupingCols, Ordering: ordering}
}

// MakeUnorderedGrouping constructs a new GroupingPrivate using the given
// grouping columns, but with no ordering on the groups.
//
// The receiver is a pointer for consistency with the other CustomFuncs
// methods (e.g. MakeOrderedGrouping, ExtractAggInputColumns); a value
// receiver would needlessly copy the struct and split the method set.
func (c *CustomFuncs) MakeUnorderedGrouping(groupingCols opt.ColSet) *memo.GroupingPrivate {
	return &memo.GroupingPrivate{GroupingCols: groupingCols}
}

// IsLimited indicates whether a limit was pushed under the subquery
// already. See e.g. the rule IntroduceExistsLimit.
func (c *CustomFuncs) IsLimited(sub *memo.SubqueryPrivate) bool {
Expand Down
24 changes: 24 additions & 0 deletions pkg/sql/opt/norm/rules/groupby.opt
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,27 @@
(GroupingCols $groupingPrivate)
$aggregations
)

# PushAggDistinctIntoScalarGroupBy pushes an aggregate function DISTINCT
# modifier into the input of the ScalarGroupBy operator. This allows the
# optimizer to take advantage of an index on the column(s) subject to the
# DISTINCT operation. PushAggDistinctIntoScalarGroupBy can match any single
# aggregate function, including those that have multiple input arguments.
#
# The DISTINCT is implemented as a DistinctOn whose grouping columns are the
# aggregate's input columns; the aggregate itself is then rewritten without
# its AggDistinct wrapper, since its input rows are now already distinct.
# The rule only matches when the ScalarGroupBy has exactly one aggregation
# item, because a DistinctOn over one aggregate's inputs could change the
# row multiset seen by a second aggregate.
[PushAggDistinctIntoScalarGroupBy, Normalize]
(ScalarGroupBy
    $input:*
    $aggregations:[
        $item:(AggregationsItem (AggDistinct $agg:*) $aggColID:*)
    ]
    $groupingPrivate:*
)
=>
(ScalarGroupBy
    (DistinctOn
        $input
        []
        (MakeUnorderedGrouping (ExtractAggInputColumns $agg))
    )
    [ (AggregationsItem $agg $aggColID) ]
    $groupingPrivate
)
Loading

0 comments on commit fb507a4

Please sign in to comment.