This is an automated email from the ASF dual-hosted git repository. jiaqizho pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit ff712241cd87efa03695bd980bb77607db95f010 Author: gpopt <103469259+gp...@users.noreply.github.com> AuthorDate: Mon Feb 13 22:45:58 2023 -0800 Fix bug that Orca fails to decorrelate subqueries order by outer reference (#14905) This patch reinstates the logic before commit 30e79d2, so that when there is order by clause in the subquery, we can safely remove the order by clause. The previous logic was not wrong, but will keep the outer reference as correlated subquery, which is also worse plan but correct correlated plan. But in our preprocessing step, 26th, it is doing wrong on decorrelating subquery, in which case, we can't decorrelate actually. The commit 30e79d2 added a fix that allows adding a project in the translator that will echo the outer ref from within the subquery. `select foo.c from foo where b in (select bar.q from bar where bar.r =10 order by foo.c);` ORCA is currently unable to decorrelate sub-queries that contain project nodes.For the above query the parser adds foo.c to the target list of the subquery, that would get an echo projection and thus would prevent decorelation by ORCA as per the mentioned commit. This patch will not project a ORDER BY column so that we can safely remove the order by clause. Author: @DevChattopadhyay --- .../gpopt/translate/CTranslatorQueryToDXL.cpp | 10 +- src/test/regress/expected/subselect_gp.out | 135 +++++++++++++++++++++ .../regress/expected/subselect_gp_optimizer.out | 134 ++++++++++++++++++++ src/test/regress/sql/subselect_gp.sql | 18 +++ 4 files changed, 293 insertions(+), 4 deletions(-) diff --git a/src/backend/gpopt/translate/CTranslatorQueryToDXL.cpp b/src/backend/gpopt/translate/CTranslatorQueryToDXL.cpp index 68b379be26..cc006fd15e 100644 --- a/src/backend/gpopt/translate/CTranslatorQueryToDXL.cpp +++ b/src/backend/gpopt/translate/CTranslatorQueryToDXL.cpp @@ -4227,17 +4227,19 @@ CTranslatorQueryToDXL::TranslateTargetListToDXLProject( GPOS_WSZ_LIT("Grouping function with outer references")); } } - else if (!is_groupby || (is_groupby && is_grouping_col)) + else if (!is_groupby || is_grouping_col) { // Insist projection for any outer refs to ensure any decorelation of a // subquery results in a correct plan using the projected reference, // instead of the outer ref directly. // TODO: Remove is_grouping_col from this check once const projections in // subqueries no longer prevent decorrelation + BOOL is_orderby_col = CTranslatorUtils::IsSortingColumn( + target_entry, m_query->sortClause); BOOL insist_proj = - (IsA(target_entry->expr, Var) && - ((Var *) (target_entry->expr))->varlevelsup > 0 && - !is_grouping_col); + IsA(target_entry->expr, Var) && + ((Var *) (target_entry->expr))->varlevelsup > 0 && + !is_orderby_col && !is_grouping_col; CDXLNode *project_elem_dxlnode = TranslateExprToDXLProject( target_entry->expr, target_entry->resname, insist_proj /* insist_new_colids */); diff --git a/src/test/regress/expected/subselect_gp.out b/src/test/regress/expected/subselect_gp.out index c447bcc555..ef30988d24 100644 --- a/src/test/regress/expected/subselect_gp.out +++ b/src/test/regress/expected/subselect_gp.out @@ -3312,3 +3312,138 @@ drop table sublink_outer_table; drop table sublink_inner_table; reset optimizer; reset enable_hashagg; +-- Ensure sub-queries with order by outer reference can be decorrelated and executed correctly. +create table r(a int, b int, c int) distributed by (a); +create table s(a int, b int, c int) distributed by (a); +insert into r values (1,2,3); +insert into s values (1,2,10); +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c); + QUERY PLAN +--------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Semi Join + Hash Cond: (r.b = s.b) + -> Seq Scan on r + -> Hash + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Postgres query optimizer +(9 rows) + +select * from r where b in (select b from s where c=10 order by r.c); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c limit 2); + QUERY PLAN +----------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Seq Scan on r + Filter: (SubPlan 1) + SubPlan 1 + -> Limit + -> Result + -> Materialize + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Postgres query optimizer +(11 rows) + +select * from r where b in (select b from s where c=10 order by r.c limit 2); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c, b); + QUERY PLAN +--------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Semi Join + Hash Cond: (r.b = s.b) + -> Seq Scan on r + -> Hash + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Postgres query optimizer +(9 rows) + +select * from r where b in (select b from s where c=10 order by r.c, b); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c, b limit 2); + QUERY PLAN +----------------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Seq Scan on r + Filter: (SubPlan 1) + SubPlan 1 + -> Limit + -> Sort + Sort Key: s.b + -> Result + -> Materialize + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Postgres query optimizer +(13 rows) + +select * from r where b in (select b from s where c=10 order by r.c, b limit 2); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by c); + QUERY PLAN +--------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Semi Join + Hash Cond: (r.b = s.b) + -> Seq Scan on r + -> Hash + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Postgres query optimizer +(9 rows) + +select * from r where b in (select b from s where c=10 order by c); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by c limit 2); + QUERY PLAN +------------------------------------------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Semi Join + Hash Cond: (r.b = "ANY_subquery".b) + -> Seq Scan on r + -> Hash + -> Broadcast Motion 1:3 (slice2; segments: 1) + -> Subquery Scan on "ANY_subquery" + -> Limit + -> Gather Motion 3:1 (slice3; segments: 3) + -> Limit + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Postgres query optimizer +(13 rows) + +select * from r where b in (select b from s where c=10 order by c limit 2); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + diff --git a/src/test/regress/expected/subselect_gp_optimizer.out b/src/test/regress/expected/subselect_gp_optimizer.out index 335ec3d220..ad2773ca97 100644 --- a/src/test/regress/expected/subselect_gp_optimizer.out +++ b/src/test/regress/expected/subselect_gp_optimizer.out @@ -3416,3 +3416,137 @@ drop table sublink_outer_table; drop table sublink_inner_table; reset optimizer; reset enable_hashagg; +-- Ensure sub-queries with order by outer reference can be decorrelated and executed correctly. +create table r(a int, b int, c int) distributed by (a); +create table s(a int, b int, c int) distributed by (a); +insert into r values (1,2,3); +insert into s values (1,2,10); +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c); + QUERY PLAN +--------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Semi Join + Hash Cond: (r.b = s.b) + -> Seq Scan on r + -> Hash + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Pivotal Optimizer (GPORCA) +(9 rows) + +select * from r where b in (select b from s where c=10 order by r.c); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c limit 2); + QUERY PLAN +------------------------------------------------------------ + Hash Semi Join + Hash Cond: (r.b = s.b) + -> Gather Motion 3:1 (slice1; segments: 3) + -> Seq Scan on r + -> Hash + -> Limit + -> Gather Motion 3:1 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Pivotal Optimizer (GPORCA) +(10 rows) + +select * from r where b in (select b from s where c=10 order by r.c limit 2); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c, b); + QUERY PLAN +--------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Semi Join + Hash Cond: (r.b = s.b) + -> Seq Scan on r + -> Hash + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Pivotal Optimizer (GPORCA) +(9 rows) + +select * from r where b in (select b from s where c=10 order by r.c, b); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c, b limit 2); + QUERY PLAN +------------------------------------------------------------ + Hash Semi Join + Hash Cond: (r.b = s.b) + -> Gather Motion 3:1 (slice1; segments: 3) + -> Seq Scan on r + -> Hash + -> Limit + -> Gather Motion 3:1 (slice2; segments: 3) + Merge Key: s.b + -> Sort + Sort Key: s.b + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Pivotal Optimizer (GPORCA) +(13 rows) + +select * from r where b in (select b from s where c=10 order by r.c, b limit 2); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by c); + QUERY PLAN +--------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + -> Hash Semi Join + Hash Cond: (r.b = s.b) + -> Seq Scan on r + -> Hash + -> Broadcast Motion 3:3 (slice2; segments: 3) + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Pivotal Optimizer (GPORCA) +(9 rows) + +select * from r where b in (select b from s where c=10 order by c); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + +explain (costs off) select * from r where b in (select b from s where c=10 order by c limit 2); + QUERY PLAN +------------------------------------------------------------ + Hash Semi Join + Hash Cond: (r.b = s.b) + -> Gather Motion 3:1 (slice1; segments: 3) + -> Seq Scan on r + -> Hash + -> Limit + -> Gather Motion 3:1 (slice2; segments: 3) + Merge Key: s.c + -> Sort + Sort Key: s.c + -> Seq Scan on s + Filter: (c = 10) + Optimizer: Pivotal Optimizer (GPORCA) +(13 rows) + +select * from r where b in (select b from s where c=10 order by c limit 2); + a | b | c +---+---+--- + 1 | 2 | 3 +(1 row) + diff --git a/src/test/regress/sql/subselect_gp.sql b/src/test/regress/sql/subselect_gp.sql index b8ba35930b..9efd19f2be 100644 --- a/src/test/regress/sql/subselect_gp.sql +++ b/src/test/regress/sql/subselect_gp.sql @@ -1319,3 +1319,21 @@ drop table sublink_outer_table; drop table sublink_inner_table; reset optimizer; reset enable_hashagg; + +-- Ensure sub-queries with order by outer reference can be decorrelated and executed correctly. +create table r(a int, b int, c int) distributed by (a); +create table s(a int, b int, c int) distributed by (a); +insert into r values (1,2,3); +insert into s values (1,2,10); +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c); +select * from r where b in (select b from s where c=10 order by r.c); +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c limit 2); +select * from r where b in (select b from s where c=10 order by r.c limit 2); +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c, b); +select * from r where b in (select b from s where c=10 order by r.c, b); +explain (costs off) select * from r where b in (select b from s where c=10 order by r.c, b limit 2); +select * from r where b in (select b from s where c=10 order by r.c, b limit 2); +explain (costs off) select * from r where b in (select b from s where c=10 order by c); +select * from r where b in (select b from s where c=10 order by c); +explain (costs off) select * from r where b in (select b from s where c=10 order by c limit 2); +select * from r where b in (select b from s where c=10 order by c limit 2); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@cloudberry.apache.org For additional commands, e-mail: commits-h...@cloudberry.apache.org