This is an automated email from the ASF dual-hosted git repository. yjhjstz pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit ede16fd9b9ae429847be8f3942b0a6bf9a1b574d Author: Aegeaner <[email protected]> AuthorDate: Thu Oct 27 17:08:47 2022 +0800 Bring back cdbsubselect_drop_distinct. (#14322) Previous commit ebb691a03 is to fix ANY sublink contains distinct-on syntax on partial exprs in the target list of subselect. That fix method simply not to drop distinct or order-by clause and might lead to some sublink cannot be pulled up and bad performance, below is an example of such regression: select a from t where a in (select b from t1 group by b) For the above SQL, if we do not drop the group-by clause, then later planner will take the subselect as not simple and fail to pull it up to a join. This commit fixes the regression by: 1. revert the logic of ebb691a03 2. fix the issue #12656 using a correct way: if the distinct on clause's length is not the same as the length of subselect's targetlist, we do not drop the distinct clause. Authored-by: Jian Guo <[email protected]> Co-authored-by: Zhenghua Lyu <[email protected]> --- src/backend/cdb/cdbsubselect.c | 22 +++- src/backend/optimizer/plan/subselect.c | 28 ++--- src/test/regress/expected/notin.out | 34 +++--- src/test/regress/expected/notin_optimizer.out | 34 +++--- src/test/regress/expected/rpt.out | 114 +++++++++------------ src/test/regress/expected/rpt_optimizer.out | 21 ++-- src/test/regress/expected/subselect.out | 10 +- src/test/regress/expected/subselect_gp.out | 35 +++++-- .../regress/expected/subselect_gp_optimizer.out | 64 ++++++++---- src/test/regress/expected/subselect_optimizer.out | 8 +- src/test/regress/sql/subselect_gp.sql | 7 ++ src/test/singlenode_regress/expected/subselect.out | 7 +- .../singlenode_regress/expected/subselect_gp.out | 16 +-- 13 files changed, 213 insertions(+), 187 deletions(-) diff --git a/src/backend/cdb/cdbsubselect.c b/src/backend/cdb/cdbsubselect.c index ae305e1e4b..9622aa2d74 100644 --- a/src/backend/cdb/cdbsubselect.c +++ b/src/backend/cdb/cdbsubselect.c @@ -466,7 +466,9 @@ cdbsubselect_drop_distinct(Query *subselect) subselect->limitOffset == NULL) { /* Delete DISTINCT. */ - subselect->distinctClause = NIL; + if (!subselect->hasDistinctOn || + list_length(subselect->distinctClause) == list_length(subselect->targetList)) + subselect->distinctClause = NIL; /* Delete GROUP BY if subquery has no aggregates and no HAVING. */ if (!subselect->hasAggs && @@ -485,7 +487,9 @@ cdbsubselect_drop_orderby(Query *subselect) subselect->limitOffset == NULL) { /* Delete ORDER BY. */ - subselect->sortClause = NIL; + if (!subselect->hasDistinctOn || + list_length(subselect->distinctClause) == list_length(subselect->targetList)) + subselect->sortClause = NIL; } } /* cdbsubselect_drop_orderby */ @@ -1500,6 +1504,20 @@ convert_IN_to_antijoin(PlannerInfo *root, SubLink *sublink, if (safe_to_convert_NOTIN(root, sublink, available_rels)) { + /* Delete ORDER BY and DISTINCT. + * + * There is no need to do the group-by or order-by inside the + * subquery, if we have decided to pull up the sublink. For the + * group-by case, after the sublink pull-up, there will be a semi-join + * plan node generated in top level, which will weed out duplicate + * tuples naturally. For the order-by case, after the sublink pull-up, + * the subquery will become a jointree, inside which the tuples' order + * doesn't matter. In a summary, it's safe to elimate the group-by or + * order-by causes here. + */ + cdbsubselect_drop_orderby(subselect); + cdbsubselect_drop_distinct(subselect); + int subq_indx = add_notin_subquery_rte(parse, subselect); List *inner_exprs = NIL; List *outer_exprs = NIL; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 4975d2cc6b..538e3da6e8 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1538,6 +1538,20 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, Assert(sublink->subLinkType == ANY_SUBLINK); Assert(IsA(subselect, Query)); + /* Delete ORDER BY and DISTINCT. + * + * There is no need to do the group-by or order-by inside the + * subquery, if we have decided to pull up the sublink. For the + * group-by case, after the sublink pull-up, there will be a semi-join + * plan node generated in top level, which will weed out duplicate + * tuples naturally. For the order-by case, after the sublink pull-up, + * the subquery will become a jointree, inside which the tuples' order + * doesn't matter. In a summary, it's safe to elimate the group-by or + * order-by causes here. + */ + cdbsubselect_drop_orderby(subselect); + cdbsubselect_drop_distinct(subselect); + /* * If uncorrelated, and no Var nodes on lhs, the subquery will be executed * only once. It should become an InitPlan, but make_subplan() doesn't @@ -1548,20 +1562,6 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, if (correlated) { - /* Delete ORDER BY and DISTINCT. - * - * There is no need to do the group-by or order-by inside the - * subquery, if we have decided to pull up the sublink (when the - * subquery is correlated). For the group-by case, after the sublink - * pull-up, there will be a semi-join plan node generated in top - * level, which will weed out duplicate tuples naturally. For the - * order-by case, after the sublink pull-up, the subquery will become - * a jointree, inside which the tuples' order doesn't matter. In a - * summary, it's safe to elimate the group-by or order-by causes here. - */ - cdbsubselect_drop_orderby(subselect); - cdbsubselect_drop_distinct(subselect); - /* * Under certain conditions, we cannot pull up the subquery as a join. */ diff --git a/src/test/regress/expected/notin.out b/src/test/regress/expected/notin.out index 97ce4dce01..7baac477d8 100644 --- a/src/test/regress/expected/notin.out +++ b/src/test/regress/expected/notin.out @@ -400,30 +400,26 @@ select a,b from g1 where (a,b) not in -- explain select x,y from l1 where (x,y) not in (select distinct y, sum(x) from l1 group by y having y < 4 order by y) order by 1,2; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=10000000002.35..10000000002.40 rows=3 width=8) + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice3; segments: 3) (cost=10000000008.02..10000000008.03 rows=4 width=8) Merge Key: l1.x, l1.y -> Sort (cost=10000000002.35..10000000002.36 rows=1 width=8) Sort Key: l1.x, l1.y - -> Nested Loop Left Anti Semi (Not-In) Join (cost=10000000001.07..10000000002.34 rows=1 width=8) + -> Nested Loop Left Anti Semi (Not-In) Join (cost=10000000003.25..10000000007.99 rows=2 width=8) Join Filter: ((l1.x = "NotIn_SUBQUERY".y) AND (l1.y = "NotIn_SUBQUERY".sum)) - -> Seq Scan on l1 (cost=0.00..1.03 rows=3 width=8) - -> Materialize (cost=1.09..1.16 rows=3 width=12) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=1.09..1.15 rows=3 width=12) - -> Subquery Scan on "NotIn_SUBQUERY" (cost=1.09..1.11 rows=1 width=12) - -> Unique (cost=1.09..1.10 rows=1 width=16) - Group Key: l1_1.y, (sum(l1_1.x)) - -> Sort (cost=1.09..1.09 rows=1 width=16) - Sort Key: l1_1.y, (sum(l1_1.x)) - -> HashAggregate (cost=1.07..1.08 rows=1 width=16) - Group Key: l1_1.y - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..1.06 rows=1 width=8) - Hash Key: l1_1.y - -> Seq Scan on l1 l1_1 (cost=0.00..1.04 rows=1 width=8) - Filter: (y < 4) + -> Seq Scan on l1 (cost=0.00..3.10 rows=4 width=8) + -> Materialize (cost=3.25..3.47 rows=3 width=12) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=3.25..3.43 rows=3 width=12) + -> Subquery Scan on "NotIn_SUBQUERY" (cost=3.25..3.31 rows=1 width=12) + -> HashAggregate (cost=3.25..3.28 rows=1 width=16) + Group Key: l1_1.y + -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=3.14..3.20 rows=1 width=12) + Hash Key: l1_1.y + -> Seq Scan on l1 l1_1 (cost=0.00..3.12 rows=2 width=8) + Filter: (y < 4) Optimizer: Postgres query optimizer -(21 rows) +(17 rows) select x,y from l1 where (x,y) not in (select distinct y, sum(x) from l1 group by y having y < 4 order by y) order by 1,2; diff --git a/src/test/regress/expected/notin_optimizer.out b/src/test/regress/expected/notin_optimizer.out index 02a114d083..46052b7e4f 100644 --- a/src/test/regress/expected/notin_optimizer.out +++ b/src/test/regress/expected/notin_optimizer.out @@ -407,30 +407,26 @@ select a,b from g1 where (a,b) not in -- explain select x,y from l1 where (x,y) not in (select distinct y, sum(x) from l1 group by y having y < 4 order by y) order by 1,2; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=10000000002.35..10000000002.40 rows=3 width=8) + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice3; segments: 3) (cost=10000000008.02..10000000008.03 rows=4 width=8) Merge Key: l1.x, l1.y -> Sort (cost=10000000002.35..10000000002.36 rows=1 width=8) Sort Key: l1.x, l1.y - -> Nested Loop Left Anti Semi (Not-In) Join (cost=10000000001.07..10000000002.34 rows=1 width=8) + -> Nested Loop Left Anti Semi (Not-In) Join (cost=10000000003.25..10000000007.99 rows=2 width=8) Join Filter: ((l1.x = "NotIn_SUBQUERY".y) AND (l1.y = "NotIn_SUBQUERY".sum)) - -> Seq Scan on l1 (cost=0.00..1.03 rows=3 width=8) - -> Materialize (cost=1.09..1.16 rows=3 width=12) - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=1.09..1.15 rows=3 width=12) - -> Subquery Scan on "NotIn_SUBQUERY" (cost=1.09..1.11 rows=1 width=12) - -> Unique (cost=1.09..1.10 rows=1 width=16) - Group Key: l1_1.y, (sum(l1_1.x)) - -> Sort (cost=1.09..1.09 rows=1 width=16) - Sort Key: l1_1.y, (sum(l1_1.x)) - -> HashAggregate (cost=1.07..1.08 rows=1 width=16) - Group Key: l1_1.y - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..1.06 rows=1 width=8) - Hash Key: l1_1.y - -> Seq Scan on l1 l1_1 (cost=0.00..1.04 rows=1 width=8) - Filter: (y < 4) + -> Seq Scan on l1 (cost=0.00..3.10 rows=4 width=8) + -> Materialize (cost=3.25..3.47 rows=3 width=12) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=3.25..3.43 rows=3 width=12) + -> Subquery Scan on "NotIn_SUBQUERY" (cost=3.25..3.31 rows=1 width=12) + -> HashAggregate (cost=3.25..3.28 rows=1 width=16) + Group Key: l1_1.y + -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=3.14..3.20 rows=1 width=12) + Hash Key: l1_1.y + -> Seq Scan on l1 l1_1 (cost=0.00..3.12 rows=2 width=8) + Filter: (y < 4) Optimizer: Postgres query optimizer -(21 rows) +(19 rows) select x,y from l1 where (x,y) not in (select distinct y, sum(x) from l1 group by y having y < 4 order by y) order by 1,2; diff --git a/src/test/regress/expected/rpt.out b/src/test/regress/expected/rpt.out index 367ebd5b40..389a5f63c0 100644 --- a/src/test/regress/expected/rpt.out +++ b/src/test/regress/expected/rpt.out @@ -1104,15 +1104,13 @@ explain select b from dist_tab where b in (select distinct c from rep_tab); --------------------------------------------------------------------------------------------------- Gather Motion 3:1 (slice1; segments: 3) (cost=10000000001.16..10000000021.44 rows=4 width=4) -> Nested Loop (cost=10000000001.16..10000000021.39 rows=1 width=4) - -> Unique (cost=10000000001.03..10000000001.04 rows=2 width=4) + -> HashAggregate (cost=10000000001.02..10000000001.03 rows=2 width=4) Group Key: rep_tab.c - -> Sort (cost=10000000001.03..10000000001.03 rows=2 width=4) - Sort Key: rep_tab.c - -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) + -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) -> Index Only Scan using idx on dist_tab (cost=0.13..10.16 rows=1 width=4) Index Cond: (b = rep_tab.c) Optimizer: Postgres query optimizer -(10 rows) +(8 rows) select b from dist_tab where b in (select distinct c from rep_tab); b @@ -1135,20 +1133,16 @@ analyze rand_tab; -- -- join derives EdtHashed explain select c from rep_tab where c in (select distinct c from rep_tab); - QUERY PLAN -------------------------------------------------------------------------------------------------------------------- - Gather Motion 1:1 (slice1; segments: 1) (cost=20000000002.14..20000000002.14 rows=3 width=4) - -> Hash Join (cost=20000000001.08..20000000002.14 rows=3 width=4) + QUERY PLAN +------------------------------------------------------------------------------------------------------- + Gather Motion 1:1 (slice1; segments: 1) (cost=20000000001.05..20000000002.11 rows=4 width=4) + -> Hash Semi Join (cost=20000000001.05..20000000002.11 rows=4 width=4) Hash Cond: (rep_tab.c = rep_tab_1.c) -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) - -> Hash (cost=10000000001.06..10000000001.06 rows=2 width=4) - -> Unique (cost=10000000001.03..10000000001.04 rows=2 width=4) - Group Key: rep_tab_1.c - -> Sort (cost=10000000001.03..10000000001.03 rows=2 width=4) - Sort Key: rep_tab_1.c - -> Seq Scan on rep_tab rep_tab_1 (cost=10000000000.00..10000000001.02 rows=2 width=4) + -> Hash (cost=10000000001.02..10000000001.02 rows=1 width=4) + -> Seq Scan on rep_tab rep_tab_1 (cost=10000000000.00..10000000001.02 rows=2 width=4) Optimizer: Postgres query optimizer -(11 rows) +(7 rows) select c from rep_tab where c in (select distinct c from rep_tab); c @@ -1163,20 +1157,16 @@ select c from rep_tab where c in (select distinct c from rep_tab); -- -- join derives EdtHashed explain select a from dist_tab where a in (select distinct c from rep_tab); - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=20000000001.08..20000000002.17 rows=4 width=4) - -> Hash Join (cost=20000000001.08..20000000002.12 rows=1 width=4) + QUERY PLAN +------------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=20000000001.05..20000000003.14 rows=4 width=4) + -> Hash Semi Join (cost=20000000001.05..20000000003.14 rows=2 width=4) Hash Cond: (dist_tab.a = rep_tab.c) - -> Seq Scan on dist_tab (cost=10000000000.00..10000000001.01 rows=1 width=4) - -> Hash (cost=10000000001.06..10000000001.06 rows=2 width=4) - -> Unique (cost=10000000001.03..10000000001.04 rows=2 width=4) - Group Key: rep_tab.c - -> Sort (cost=10000000001.03..10000000001.03 rows=2 width=4) - Sort Key: rep_tab.c - -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) + -> Seq Scan on dist_tab (cost=10000000000.00..10000000002.04 rows=2 width=4) + -> Hash (cost=10000000001.02..10000000001.02 rows=1 width=4) + -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) Optimizer: Postgres query optimizer -(11 rows) +(7 rows) select a from dist_tab where a in (select distinct c from rep_tab); a @@ -1193,20 +1183,16 @@ select a from dist_tab where a in (select distinct c from rep_tab); -- -- join derives EdtRandom explain select d from rand_tab where d in (select distinct c from rep_tab); - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=20000000001.08..20000000002.15 rows=3 width=4) - -> Hash Join (cost=20000000001.08..20000000002.11 rows=1 width=4) + QUERY PLAN +------------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=20000000001.05..20000000002.11 rows=4 width=4) + -> Hash Semi Join (cost=20000000001.05..20000000002.11 rows=2 width=4) Hash Cond: (rand_tab.d = rep_tab.c) - -> Seq Scan on rand_tab (cost=10000000000.00..10000000001.01 rows=1 width=4) - -> Hash (cost=10000000001.06..10000000001.06 rows=2 width=4) - -> Unique (cost=10000000001.03..10000000001.04 rows=2 width=4) - Group Key: rep_tab.c - -> Sort (cost=10000000001.03..10000000001.03 rows=2 width=4) - Sort Key: rep_tab.c - -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) + -> Seq Scan on rand_tab (cost=10000000000.00..10000000001.02 rows=1 width=4) + -> Hash (cost=10000000001.02..10000000001.02 rows=1 width=4) + -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) Optimizer: Postgres query optimizer -(11 rows) +(7 rows) select d from rand_tab where d in (select distinct c from rep_tab); d @@ -1221,20 +1207,18 @@ select d from rand_tab where d in (select distinct c from rep_tab); -- -- join derives EdtHashed explain select c from rep_tab where c in (select distinct a from dist_tab); - QUERY PLAN ----------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=20000000001.05..20000000002.14 rows=3 width=4) - -> Hash Join (cost=20000000001.05..20000000002.09 rows=1 width=4) + QUERY PLAN +------------------------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=20000000001.03..20000000002.14 rows=3 width=4) + -> Hash Semi Join (cost=20000000001.03..20000000002.09 rows=1 width=4) Hash Cond: (rep_tab.c = dist_tab.a) - -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) - -> Hash (cost=10000000001.04..10000000001.04 rows=1 width=4) - -> Unique (cost=10000000001.02..10000000001.03 rows=1 width=4) - Group Key: dist_tab.a - -> Sort (cost=10000000001.02..10000000001.03 rows=1 width=4) - Sort Key: dist_tab.a - -> Seq Scan on dist_tab (cost=10000000000.00..10000000001.01 rows=1 width=4) + -> Redistribute Motion 1:3 (slice2; segments: 1) (cost=10000000000.00..10000000001.05 rows=1 width=4) + Hash Key: rep_tab.c + -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) + -> Hash (cost=10000000001.01..10000000001.01 rows=1 width=4) + -> Seq Scan on dist_tab (cost=10000000000.00..10000000001.01 rows=1 width=4) Optimizer: Postgres query optimizer -(11 rows) +(9 rows) select c from rep_tab where c in (select distinct a from dist_tab); c @@ -1249,22 +1233,20 @@ select c from rep_tab where c in (select distinct a from dist_tab); -- -- join derives EdtHashed explain select c from rep_tab where c in (select distinct d from rand_tab); - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=20000000001.07..20000000002.15 rows=3 width=4) - -> Hash Join (cost=20000000001.07..20000000002.11 rows=1 width=4) + QUERY PLAN +------------------------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=20000000001.04..20000000002.15 rows=3 width=4) + -> Hash Semi Join (cost=20000000001.04..20000000002.11 rows=1 width=4) Hash Cond: (rep_tab.c = rand_tab.d) - -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) - -> Hash (cost=10000000001.06..10000000001.06 rows=1 width=4) - -> Unique (cost=10000000001.04..10000000001.05 rows=1 width=4) - Group Key: rand_tab.d - -> Sort (cost=10000000001.04..10000000001.05 rows=1 width=4) - Sort Key: rand_tab.d - -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=10000000000.00..10000000001.03 rows=1 width=4) - Hash Key: rand_tab.d - -> Seq Scan on rand_tab (cost=10000000000.00..10000000001.01 rows=1 width=4) + -> Redistribute Motion 1:3 (slice2; segments: 1) (cost=10000000000.00..10000000001.05 rows=1 width=4) + Hash Key: rep_tab.c + -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) + -> Hash (cost=10000000001.03..10000000001.03 rows=1 width=4) + -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=10000000000.00..10000000001.03 rows=1 width=4) + Hash Key: rand_tab.d + -> Seq Scan on rand_tab (cost=10000000000.00..10000000001.01 rows=1 width=4) Optimizer: Postgres query optimizer -(13 rows) +(11 rows) select c from rep_tab where c in (select distinct d from rand_tab); c diff --git a/src/test/regress/expected/rpt_optimizer.out b/src/test/regress/expected/rpt_optimizer.out index 62a9fe489a..5585b29b56 100644 --- a/src/test/regress/expected/rpt_optimizer.out +++ b/src/test/regress/expected/rpt_optimizer.out @@ -1092,20 +1092,17 @@ set optimizer_enable_hashjoin=off; set enable_hashjoin=off; set enable_nestloop=on; explain select b from dist_tab where b in (select distinct c from rep_tab); - QUERY PLAN ---------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..443.00 rows=4 width=4) - -> Nested Loop (cost=0.00..443.00 rows=2 width=4) - Join Filter: true - -> GroupAggregate (cost=0.00..431.00 rows=2 width=4) + QUERY PLAN +------------------------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) (cost=10000000001.15..10000000021.42 rows=4 width=4) + -> Nested Loop (cost=10000000001.15..10000000021.36 rows=1 width=4) + -> HashAggregate (cost=10000000001.02..10000000001.03 rows=2 width=4) Group Key: rep_tab.c - -> Sort (cost=0.00..431.00 rows=2 width=4) - Sort Key: rep_tab.c - -> Seq Scan on rep_tab (cost=0.00..431.00 rows=2 width=4) - -> Index Scan using idx on dist_tab (cost=0.00..12.00 rows=1 width=4) + -> Seq Scan on rep_tab (cost=10000000000.00..10000000001.02 rows=2 width=4) + -> Index Only Scan using idx on dist_tab (cost=0.13..10.16 rows=1 width=4) Index Cond: (b = rep_tab.c) - Optimizer: Pivotal Optimizer (GPORCA) -(11 rows) + Optimizer: Postgres query optimizer +(8 rows) select b from dist_tab where b in (select distinct c from rep_tab); b diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 427fdf060d..f4517c9b58 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1358,16 +1358,10 @@ select * from int4_tbl o where (f1, f1) in Output: i.f1, ((generate_series(1, 50)) / 10) -> ProjectSet Output: generate_series(1, 50), i.f1 - -> GroupAggregate + -> Seq Scan on public.int4_tbl i Output: i.f1 - Group Key: i.f1 - -> Sort - Output: i.f1 - Sort Key: i.f1 - -> Seq Scan on public.int4_tbl i - Output: i.f1 Optimizer: Postgres query optimizer -(25 rows) +(20 rows) select * from int4_tbl o where (f1, f1) in (select f1, generate_series(1,50) / 10 g from int4_tbl i group by f1); diff --git a/src/test/regress/expected/subselect_gp.out b/src/test/regress/expected/subselect_gp.out index 3dc5dcdfa7..9aff68f249 100644 --- a/src/test/regress/expected/subselect_gp.out +++ b/src/test/regress/expected/subselect_gp.out @@ -1763,11 +1763,9 @@ EXPLAIN select count(*) from Group Key: b.hundred -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..389.00 rows=3334 width=4) Hash Key: b.hundred - -> HashAggregate (cost=70.67..71.67 rows=100 width=4) - Group Key: b.hundred -> Seq Scan on tenk1 b (cost=0.00..62.33 rows=3333 width=4) Optimizer: Postgres query optimizer -(15 rows) +(13 rows) EXPLAIN select count(distinct ss.ten) from (select ten from tenk1 a @@ -1791,9 +1789,7 @@ EXPLAIN select count(distinct ss.ten) from Group Key: b.hundred -> Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..389.00 rows=3334 width=4) Hash Key: b.hundred - -> HashAggregate (cost=70.67..71.67 rows=100 width=4) - Group Key: b.hundred - -> Seq Scan on tenk1 b (cost=0.00..62.33 rows=3333 width=4) + -> Seq Scan on tenk1 b (cost=0.00..62.33 rows=3333 width=4) Optimizer: Postgres query optimizer (19 rows) @@ -3053,6 +3049,33 @@ select * from issue_12656 where (i, j) in (select distinct on (i) i, j from issu 1 | 10002 (1 row) +-- case 3, check correlated DISTINCT ON +explain select * from issue_12656 a where (i, j) in +(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j asc); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..895.00 rows=43050 width=8) + -> Seq Scan on issue_12656 a (cost=0.00..321.00 rows=14350 width=8) + Filter: (SubPlan 1) + SubPlan 1 + -> Unique (cost=9818.00..10033.25 rows=1000 width=8) + -> Sort (cost=9818.00..10033.25 rows=86100 width=8) + Sort Key: b.j + -> Result (cost=0.00..2760.50 rows=86100 width=8) + Filter: (a.i = b.i) + -> Materialize (cost=0.00..1899.50 rows=86100 width=8) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1469.00 rows=86100 width=8) + -> Seq Scan on issue_12656 b (cost=0.00..321.00 rows=28700 width=8) + Optimizer: Postgres query optimizer +(13 rows) + +select * from issue_12656 a where (i, j) in +(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j asc); + i | j +---+------- + 1 | 10001 +(1 row) + --- --- Test param info is preserved when bringing a path to OuterQuery locus --- diff --git a/src/test/regress/expected/subselect_gp_optimizer.out b/src/test/regress/expected/subselect_gp_optimizer.out index e9a83655cd..a8bca6365f 100644 --- a/src/test/regress/expected/subselect_gp_optimizer.out +++ b/src/test/regress/expected/subselect_gp_optimizer.out @@ -1775,27 +1775,22 @@ EXPLAIN select count(distinct ss.ten) from EXPLAIN select count(*) from (select 1 from tenk1 a where unique1 IN (select distinct hundred from tenk1 b)) ss; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=0.00..431.98 rows=1 width=8) - -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.98 rows=1 width=8) - -> Partial Aggregate (cost=0.00..431.98 rows=1 width=8) - -> Nested Loop (cost=0.00..431.98 rows=34 width=1) - Join Filter: true - -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.94 rows=100 width=4) - -> GroupAggregate (cost=0.00..431.94 rows=34 width=4) - Group Key: tenk1.hundred - -> Sort (cost=0.00..431.94 rows=34 width=4) - Sort Key: tenk1.hundred - -> Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.94 rows=34 width=4) - Hash Key: tenk1.hundred - -> Streaming HashAggregate (cost=0.00..431.94 rows=34 width=4) - Group Key: tenk1.hundred - -> Seq Scan on tenk1 (cost=0.00..431.51 rows=3334 width=4) - -> Index Only Scan using tenk1_unique1 on tenk1 tenk1_1 (cost=0.00..0.04 rows=1 width=1) - Index Cond: (unique1 = tenk1.hundred) - Optimizer: Pivotal Optimizer (GPORCA) version 2.75.0 -(17 rows) + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=210.51..210.52 rows=1 width=8) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=210.45..210.50 rows=3 width=8) + -> Partial Aggregate (cost=210.45..210.46 rows=1 width=8) + -> Hash Join (cost=138.92..210.37 rows=33 width=0) + Hash Cond: (a.unique1 = b.hundred) + -> Seq Scan on tenk1 a (cost=0.00..62.33 rows=3333 width=4) + -> Hash (cost=137.67..137.67 rows=100 width=4) + -> HashAggregate (cost=137.33..137.67 rows=100 width=4) + Group Key: b.hundred + -> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..129.00 rows=3333 width=4) + Hash Key: b.hundred + -> Seq Scan on tenk1 b (cost=0.00..62.33 rows=3333 width=4) + Optimizer: Postgres query optimizer +(13 rows) EXPLAIN select count(distinct ss.ten) from (select ten from tenk1 a @@ -3146,6 +3141,33 @@ select * from issue_12656 where (i, j) in (select distinct on (i) i, j from issu 1 | 10002 (1 row) +-- case 3, check correlated DISTINCT ON +explain select * from issue_12656 a where (i, j) in +(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j asc); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..895.00 rows=43050 width=8) + -> Seq Scan on issue_12656 a (cost=0.00..321.00 rows=14350 width=8) + Filter: (SubPlan 1) + SubPlan 1 + -> Unique (cost=9818.00..10033.25 rows=1000 width=8) + -> Sort (cost=9818.00..10033.25 rows=86100 width=8) + Sort Key: b.j + -> Result (cost=0.00..2760.50 rows=86100 width=8) + Filter: (a.i = b.i) + -> Materialize (cost=0.00..1899.50 rows=86100 width=8) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1469.00 rows=86100 width=8) + -> Seq Scan on issue_12656 b (cost=0.00..321.00 rows=28700 width=8) + Optimizer: Postgres query optimizer +(13 rows) + +select * from issue_12656 a where (i, j) in +(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j asc); + i | j +---+------- + 1 | 10001 +(1 row) + --- --- Test param info is preserved when bringing a path to OuterQuery locus --- diff --git a/src/test/regress/expected/subselect_optimizer.out b/src/test/regress/expected/subselect_optimizer.out index 3855edf1b9..c56ad50a3b 100644 --- a/src/test/regress/expected/subselect_optimizer.out +++ b/src/test/regress/expected/subselect_optimizer.out @@ -1410,14 +1410,8 @@ select * from int4_tbl o where (f1, f1) in Output: i.f1, ((generate_series(1, 50)) / 10) -> ProjectSet Output: generate_series(1, 50), i.f1 - -> GroupAggregate + -> Seq Scan on public.int4_tbl i Output: i.f1 - Group Key: i.f1 - -> Sort - Output: i.f1 - Sort Key: i.f1 - -> Seq Scan on public.int4_tbl i - Output: i.f1 Optimizer: Postgres query optimizer (19 rows) diff --git a/src/test/regress/sql/subselect_gp.sql b/src/test/regress/sql/subselect_gp.sql index f83ddf6456..bdc4346274 100644 --- a/src/test/regress/sql/subselect_gp.sql +++ b/src/test/regress/sql/subselect_gp.sql @@ -1209,6 +1209,13 @@ select * from issue_12656 where (i, j) in (select distinct on (i) i, j from issu select * from issue_12656 where (i, j) in (select distinct on (i) i, j from issue_12656 order by i, j desc); +-- case 3, check correlated DISTINCT ON +explain select * from issue_12656 a where (i, j) in +(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j asc); + +select * from issue_12656 a where (i, j) in +(select distinct on (i) i, j from issue_12656 b where a.i=b.i order by i, j asc); + --- --- Test param info is preserved when bringing a path to OuterQuery locus --- diff --git a/src/test/singlenode_regress/expected/subselect.out b/src/test/singlenode_regress/expected/subselect.out index 54a2ba10ac..d2b4222e2d 100644 --- a/src/test/singlenode_regress/expected/subselect.out +++ b/src/test/singlenode_regress/expected/subselect.out @@ -1356,13 +1356,10 @@ select * from int4_tbl o where (f1, f1) in Output: i.f1, ((generate_series(1, 50)) / 10) -> ProjectSet Output: generate_series(1, 50), i.f1 - -> HashAggregate + -> Seq Scan on public.int4_tbl i Output: i.f1 - Group Key: i.f1 - -> Seq Scan on public.int4_tbl i - Output: i.f1 Optimizer: Postgres query optimizer -(20 rows) +(17 rows) select * from int4_tbl o where (f1, f1) in (select f1, generate_series(1,50) / 10 g from int4_tbl i group by f1); diff --git a/src/test/singlenode_regress/expected/subselect_gp.out b/src/test/singlenode_regress/expected/subselect_gp.out index 83cbeb2f5e..7538ea4b4d 100644 --- a/src/test/singlenode_regress/expected/subselect_gp.out +++ b/src/test/singlenode_regress/expected/subselect_gp.out @@ -1512,12 +1512,12 @@ EXPLAIN select count(*) from where unique1 IN (select distinct hundred from tenk1 b)) ss; QUERY PLAN ---------------------------------------------------------------------------------------------------------------- - Aggregate (cost=410.03..410.04 rows=1 width=8) - -> Hash Join (cost=197.53..409.78 rows=100 width=0) + Aggregate (cost=410.15..410.16 rows=1 width=8) + -> Hash Join (cost=197.53..409.90 rows=100 width=0) Hash Cond: (a.unique1 = b.hundred) -> Seq Scan on tenk1 a (cost=0.00..185.00 rows=10000 width=4) -> Hash (cost=196.28..196.28 rows=100 width=4) - -> Unique (cost=0.29..195.28 rows=100 width=4) + -> HashAggregate (cost=195.28..196.28 rows=100 width=4) Group Key: b.hundred -> Index Only Scan using tenk1_hundred on tenk1 b (cost=0.29..170.28 rows=10000 width=4) Optimizer: Postgres query optimizer @@ -1528,12 +1528,12 @@ EXPLAIN select count(distinct ss.ten) from where unique1 IN (select distinct hundred from tenk1 b)) ss; QUERY PLAN ---------------------------------------------------------------------------------------------------------------- - Aggregate (cost=410.03..410.04 rows=1 width=8) - -> Hash Join (cost=197.53..409.78 rows=100 width=4) + Aggregate (cost=410.15..410.16 rows=1 width=8) + -> Hash Join (cost=197.53..409.90 rows=100 width=4) Hash Cond: (a.unique1 = b.hundred) -> Seq Scan on tenk1 a (cost=0.00..185.00 rows=10000 width=8) -> Hash (cost=196.28..196.28 rows=100 width=4) - -> Unique (cost=0.29..195.28 rows=100 width=4) + -> HashAggregate (cost=195.28..196.28 rows=100 width=4) Group Key: b.hundred -> Index Only Scan using tenk1_hundred on tenk1 b (cost=0.29..170.28 rows=10000 width=4) Optimizer: Postgres query optimizer @@ -2187,7 +2187,7 @@ select * from foo left outer join baz on (select bar.i from bar where bar.i = fo Output: bar.i Filter: (bar.i = foo.i) Optimizer: Postgres query optimizer -(15 rows) +(14 rows) select * from foo left outer join baz on (select bar.i from bar where bar.i = foo.i) + 1 = baz.j; i | j | i | j @@ -2230,7 +2230,7 @@ select * from foo where -> Seq Scan on subselect_gp.baz b Output: b.i Optimizer: Postgres query optimizer -(14 rows) +(13 rows) select * from foo where (case when foo.i in (select a.i from baz a) then foo.i else null end) in --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
