This is an automated email from the ASF dual-hosted git repository. yjhjstz pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 8e3de84113e06200c1096e85a21e6a5cc41c533b Author: Aegeaner <[email protected]> AuthorDate: Wed Oct 19 11:50:05 2022 +0800 Bring back cdbsubselect_drop_distinct from 5X branch. (#14258) In GP's convert_ANY_sublink_to_join() now, there is no simplification of origin subquery, as a result sometimes the sublink cannot be pulled up, which produced a bad query plan. In GP5, there is a subquery simplify step which helped sublink pull-up, but we lose this simplification since version 6X until master, which caused the sublink pull-up doesn't work as only simple subquery is allowed to be pulled up. By this commit we brought back cdbsubselect_drop_distinct and cdbsubselect_drop_orderby, then we can got simplified subquery again and do the sublink pull-up optimization on it which produced a better plan. --- src/backend/cdb/cdbsubselect.c | 31 +++++++++ src/backend/optimizer/plan/subselect.c | 14 ++++ src/include/cdb/cdbsubselect.h | 3 + src/test/regress/expected/subselect_gp.out | 71 ++++++++++++++++++++ .../regress/expected/subselect_gp_optimizer.out | 75 ++++++++++++++++++++++ src/test/regress/sql/subselect_gp.sql | 18 ++++++ 6 files changed, 212 insertions(+) diff --git a/src/backend/cdb/cdbsubselect.c b/src/backend/cdb/cdbsubselect.c index 6180441c3e..ae305e1e4b 100644 --- a/src/backend/cdb/cdbsubselect.c +++ b/src/backend/cdb/cdbsubselect.c @@ -456,7 +456,38 @@ SubqueryToJoinWalker(Node *node, ConvertSubqueryToJoinContext *context) return; } +/* + * cdbsubselect_drop_distinct + */ +void +cdbsubselect_drop_distinct(Query *subselect) +{ + if (subselect->limitCount == NULL && + subselect->limitOffset == NULL) + { + /* Delete DISTINCT. */ + subselect->distinctClause = NIL; + + /* Delete GROUP BY if subquery has no aggregates and no HAVING. */ + if (!subselect->hasAggs && + subselect->havingQual == NULL) + subselect->groupClause = NIL; + } +} /* cdbsubselect_drop_distinct */ +/* + * cdbsubselect_drop_orderby + */ +void +cdbsubselect_drop_orderby(Query *subselect) +{ + if (subselect->limitCount == NULL && + subselect->limitOffset == NULL) + { + /* Delete ORDER BY. */ + subselect->sortClause = NIL; + } +} /* cdbsubselect_drop_orderby */ /** * Safe to convert expr sublink to a join diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 41ecf8dbb1..4975d2cc6b 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1548,6 +1548,20 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, if (correlated) { + /* Delete ORDER BY and DISTINCT. + * + * There is no need to do the group-by or order-by inside the + * subquery, if we have decided to pull up the sublink (when the + * subquery is correlated). For the group-by case, after the sublink + * pull-up, there will be a semi-join plan node generated in top + * level, which will weed out duplicate tuples naturally. For the + * order-by case, after the sublink pull-up, the subquery will become + * a jointree, inside which the tuples' order doesn't matter. In a + * summary, it's safe to elimate the group-by or order-by causes here. + */ + cdbsubselect_drop_orderby(subselect); + cdbsubselect_drop_distinct(subselect); + /* * Under certain conditions, we cannot pull up the subquery as a join. */ diff --git a/src/include/cdb/cdbsubselect.h b/src/include/cdb/cdbsubselect.h index 5cf3f85442..41df4f539c 100644 --- a/src/include/cdb/cdbsubselect.h +++ b/src/include/cdb/cdbsubselect.h @@ -24,4 +24,7 @@ extern bool is_simple_subquery(PlannerInfo *root, Query *subquery, RangeTblEntry JoinExpr *lowest_outer_join); extern JoinExpr *convert_IN_to_antijoin(PlannerInfo *root, SubLink *sublink, Relids available_rels); +extern void cdbsubselect_drop_orderby(Query *subselect); +extern void cdbsubselect_drop_distinct(Query *subselect); + #endif /* CDBSUBSELECT_H */ diff --git a/src/test/regress/expected/subselect_gp.out b/src/test/regress/expected/subselect_gp.out index 2995f52a6d..3dc5dcdfa7 100644 --- a/src/test/regress/expected/subselect_gp.out +++ b/src/test/regress/expected/subselect_gp.out @@ -1626,6 +1626,77 @@ EXPLAIN SELECT '' AS five, f1 AS "Correlated Field" Optimizer: Postgres query optimizer (13 rows) +-- Test simplify group-by/order-by inside subquery if sublink pull-up is possible +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 GROUP BY f2); + QUERY PLAN +------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=1.05..2.13 rows=3 width=40) + -> Hash Semi Join (cost=1.05..2.09 rows=1 width=40) + Hash Cond: (upper.f1 = subselect_tbl.f1) + -> Seq Scan on subselect_tbl upper (cost=0.00..1.03 rows=3 width=8) + -> Hash (cost=1.03..1.03 rows=1 width=8) + -> Seq Scan on subselect_tbl (cost=0.00..1.03 rows=1 width=8) + Filter: (f1 = f2) + Optimizer: Postgres query optimizer +(8 rows) + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 GROUP BY f2 LIMIT 3); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1.08 rows=4 width=40) + -> Seq Scan on subselect_tbl upper (cost=0.00..1.03 rows=1 width=40) + Filter: (SubPlan 1) + SubPlan 1 + -> Limit (cost=1.27..1.30 rows=3 width=4) + -> HashAggregate (cost=1.27..1.33 rows=6 width=4) + Group Key: subselect_tbl.f2 + -> Result (cost=0.00..1.25 rows=8 width=4) + Filter: (subselect_tbl.f1 = upper.f1) + -> Materialize (cost=0.00..1.17 rows=8 width=8) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1.13 rows=8 width=8) + -> Seq Scan on subselect_tbl (cost=0.00..1.03 rows=3 width=8) + Optimizer: Postgres query optimizer +(13 rows) + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 ORDER BY f2); + QUERY PLAN +------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=1.05..2.13 rows=3 width=40) + -> Hash Semi Join (cost=1.05..2.09 rows=1 width=40) + Hash Cond: (upper.f1 = subselect_tbl.f1) + -> Seq Scan on subselect_tbl upper (cost=0.00..1.03 rows=3 width=8) + -> Hash (cost=1.03..1.03 rows=1 width=8) + -> Seq Scan on subselect_tbl (cost=0.00..1.03 rows=1 width=8) + Filter: (f1 = f2) + Optimizer: Postgres query optimizer +(8 rows) + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 ORDER BY f2 LIMIT 3); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1.08 rows=4 width=40) + -> Seq Scan on subselect_tbl upper (cost=0.00..1.03 rows=1 width=40) + Filter: (SubPlan 1) + SubPlan 1 + -> Limit (cost=1.36..1.36 rows=3 width=4) + -> Sort (cost=1.36..1.38 rows=8 width=4) + Sort Key: subselect_tbl.f2 + -> Result (cost=0.00..1.25 rows=8 width=4) + Filter: (subselect_tbl.f1 = upper.f1) + -> Materialize (cost=0.00..1.17 rows=8 width=8) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..1.13 rows=8 width=8) + -> Seq Scan on subselect_tbl (cost=0.00..1.03 rows=3 width=8) + Optimizer: Postgres query optimizer +(13 rows) + -- -- Test cases to catch unpleasant interactions between IN-join processing -- and subquery pullup. diff --git a/src/test/regress/expected/subselect_gp_optimizer.out b/src/test/regress/expected/subselect_gp_optimizer.out index 8aba40fb65..e9a83655cd 100644 --- a/src/test/regress/expected/subselect_gp_optimizer.out +++ b/src/test/regress/expected/subselect_gp_optimizer.out @@ -1642,6 +1642,81 @@ EXPLAIN SELECT '' AS five, f1 AS "Correlated Field" Optimizer: Postgres query optimizer (13 rows) +-- Test simplify group-by/order-by inside subquery if sublink pull-up is possible +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 GROUP BY f2); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Result (cost=0.00..862.00 rows=8 width=16) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=8 width=8) + -> Hash Semi Join (cost=0.00..862.00 rows=3 width=8) + Hash Cond: ((subselect_tbl.f1 = subselect_tbl_1.f1) AND (subselect_tbl.f1 = subselect_tbl_1.f2)) + -> Seq Scan on subselect_tbl (cost=0.00..431.00 rows=3 width=8) + -> Hash (cost=431.00..431.00 rows=3 width=8) + -> Seq Scan on subselect_tbl subselect_tbl_1 (cost=0.00..431.00 rows=3 width=8) + Optimizer: Pivotal Optimizer (GPORCA) +(8 rows) + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 GROUP BY f2 LIMIT 3); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------- + Result (cost=0.00..1324038.55 rows=8 width=16) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324038.55 rows=8 width=8) + -> Seq Scan on subselect_tbl (cost=0.00..1324038.55 rows=3 width=8) + Filter: (SubPlan 1) + SubPlan 1 + -> Limit (cost=0.00..431.01 rows=1 width=4) + -> GroupAggregate (cost=0.00..431.01 rows=1 width=4) + Group Key: subselect_tbl_1.f2 + -> Sort (cost=0.00..431.01 rows=1 width=4) + Sort Key: subselect_tbl_1.f2 + -> Result (cost=0.00..431.01 rows=1 width=4) + Filter: (subselect_tbl_1.f1 = subselect_tbl.f1) + -> Materialize (cost=0.00..431.00 rows=8 width=8) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=8 width=8) + -> Seq Scan on subselect_tbl subselect_tbl_1 (cost=0.00..431.00 rows=3 width=8) + Optimizer: Pivotal Optimizer (GPORCA) +(16 rows) + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 ORDER BY f2); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Result (cost=0.00..862.00 rows=8 width=16) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.00 rows=8 width=8) + -> Hash Semi Join (cost=0.00..862.00 rows=3 width=8) + Hash Cond: ((subselect_tbl.f1 = subselect_tbl_1.f1) AND (subselect_tbl.f1 = subselect_tbl_1.f2)) + -> Seq Scan on subselect_tbl (cost=0.00..431.00 rows=3 width=8) + -> Hash (cost=431.00..431.00 rows=3 width=8) + -> Seq Scan on subselect_tbl subselect_tbl_1 (cost=0.00..431.00 rows=3 width=8) + Optimizer: Pivotal Optimizer (GPORCA) +(8 rows) + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 ORDER BY f2 LIMIT 3); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------- + Result (cost=0.00..1324038.57 rows=8 width=16) + -> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1324038.57 rows=8 width=8) + -> Seq Scan on subselect_tbl (cost=0.00..1324038.57 rows=3 width=8) + Filter: (SubPlan 1) + SubPlan 1 + -> Limit (cost=0.00..431.01 rows=1 width=4) + -> Sort (cost=0.00..431.01 rows=1 width=4) + Sort Key: subselect_tbl_1.f2 + -> Result (cost=0.00..431.01 rows=1 width=4) + Filter: (subselect_tbl_1.f1 = subselect_tbl.f1) + -> Materialize (cost=0.00..431.00 rows=8 width=8) + -> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=8 width=8) + -> Seq Scan on subselect_tbl subselect_tbl_1 (cost=0.00..431.00 rows=3 width=8) + Optimizer: Pivotal Optimizer (GPORCA) +(14 rows) + -- -- Test cases to catch unpleasant interactions between IN-join processing -- and subquery pullup. diff --git a/src/test/regress/sql/subselect_gp.sql b/src/test/regress/sql/subselect_gp.sql index de95b3d355..f83ddf6456 100644 --- a/src/test/regress/sql/subselect_gp.sql +++ b/src/test/regress/sql/subselect_gp.sql @@ -745,6 +745,24 @@ EXPLAIN SELECT '' AS five, f1 AS "Correlated Field" WHERE (f1, f2) IN (SELECT f2, CAST(f3 AS int4) FROM SUBSELECT_TBL WHERE f3 IS NOT NULL) ORDER BY 2; +-- Test simplify group-by/order-by inside subquery if sublink pull-up is possible +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 GROUP BY f2); + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 GROUP BY f2 LIMIT 3); + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 ORDER BY f2); + +EXPLAIN SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1 ORDER BY f2 LIMIT 3); + + -- -- Test cases to catch unpleasant interactions between IN-join processing -- and subquery pullup. --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
