This is an automated email from the ASF dual-hosted git repository. maxyang pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 9f2fe29578058910742eeb42f097d76fd35e0b0e Author: Alexander Kondakov <[email protected]> AuthorDate: Mon Aug 14 10:53:10 2023 +0300 Fix ORCA invalid processing of nested SubLinks with GROUP BY attributes The ORCA optimizer could produce an invalid query plan due to incorrect processing of nested SubLinks (SubLink contains one more SubLink or rtable subquery inside) during query normalization. The incorrect behaviour took place when a query had a nested SubLink in its targetList, and SubLink lower levels contained attributes that were referenced in the query's GROUP BY clause. ORCA could construct for such queries an invalid plan, according to which some wrong and unexpected attributes got into the nested part of SubLink's plan (the example is provided in tests section). And the presence of those attributes could lead to incorrect query execution or even segmentation fault. Inside the NormalizeGroupByProjList function the context.m_lower_table_tlist, which contains grouping attributes from target_list_copy, is created. And the entries of target_list_copy, that are not grouping attributes, are mutated inside the RunExtractAggregatesMutator in order to make their inner expressions reference the corresponding entries from context.m_lower_table_tlist. The invalid plan could be produced because of incorrect mapping between Var's varattno (in target_list_copy) and some TargetEntry's resno (in context.m_lower_table_tlist). This mapping is established by RunExtractAggregatesMutator, where for the Var code branch the following assignment takes place: var->varattno = found_tle->resno, where found_tle is the corresponding entry from context.m_lower_table_tlist. However, the behaviour of this function was erroneous for the case of nested SubLinks because it mutated only the upper level of nested SubLink structure, and other subqueries remained unmodified because RunExtractAggregatesMutator did not have a branch for the Query, which was required for futher diving into lower subqueries. And because inner Vars hadn't been mutated, they started reference wrong attributes. This patch modifies RunExtractAggregatesMutator function by adding the Query branch to make the mutator correctly step into nested SubLink queries. And the call of MutateQueryOrExpressionTree is replaced with the call of mutator itself in order to correctly modify m_current_query_level when stepping into lower subqueries. --- src/backend/gpopt/translate/CQueryMutators.cpp | 38 +++++++++++++++--- src/test/regress/expected/bfv_olap.out | 1 - src/test/regress/expected/bfv_olap_optimizer.out | 3 -- src/test/regress/expected/subselect.out | 43 ++++++++++++++++++++ src/test/regress/expected/subselect_optimizer.out | 48 +++++++++++++++++++++++ src/test/regress/sql/bfv_olap.sql | 1 - src/test/regress/sql/subselect.sql | 26 ++++++++++++ 7 files changed, 150 insertions(+), 10 deletions(-) diff --git a/src/backend/gpopt/translate/CQueryMutators.cpp b/src/backend/gpopt/translate/CQueryMutators.cpp index 31c3c19405..59311bbad6 100644 --- a/src/backend/gpopt/translate/CQueryMutators.cpp +++ b/src/backend/gpopt/translate/CQueryMutators.cpp @@ -809,17 +809,45 @@ CQueryMutators::RunExtractAggregatesMutator(Node *node, GPOS_ASSERT(IsA(old_sublink->subselect, Query)); - new_sublink->subselect = gpdb::MutateQueryOrExpressionTree( - old_sublink->subselect, - (MutatorWalkerFn) RunExtractAggregatesMutator, (void *) context, - 0 // mutate into cte-lists - ); + // One need to call the Query mutator for subselect and take into + // account that SubLink can be multi-level. Therefore, the + // context->m_current_query_level must be modified properly + // while diving into such nested SubLink. + new_sublink->subselect = + RunExtractAggregatesMutator(old_sublink->subselect, context); context->m_current_query_level--; return (Node *) new_sublink; } + if (IsA(node, Query)) + { + // Mutate Query tree and ignore rtable subqueries in order to modify + // m_current_query_level properly when mutating them below. + Query *query = gpdb::MutateQueryTree( + (Query *) node, (MutatorWalkerFn) RunExtractAggregatesMutator, + context, QTW_IGNORE_RT_SUBQUERIES); + + ListCell *lc; + ForEach(lc, query->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + + if (RTE_SUBQUERY == rte->rtekind) + { + Query *subquery = rte->subquery; + context->m_current_query_level++; + rte->subquery = (Query *) RunExtractAggregatesMutator( + (Node *) subquery, context); + context->m_current_query_level--; + gpdb::GPDBFree(subquery); + } + } + + return (Node *) query; + } + return gpdb::MutateExpressionTree( node, (MutatorWalkerFn) RunExtractAggregatesMutator, context); } diff --git a/src/test/regress/expected/bfv_olap.out b/src/test/regress/expected/bfv_olap.out index eadb52b004..f913e2c03d 100644 --- a/src/test/regress/expected/bfv_olap.out +++ b/src/test/regress/expected/bfv_olap.out @@ -764,7 +764,6 @@ select * from (select sum(a.salary) over(), count(*) 2100 | 1 (2 rows) --- this query currently falls back, needs to be fixed select (select rn from (select row_number() over () as rn, name from t1_github_issue_10143 where code = a.code diff --git a/src/test/regress/expected/bfv_olap_optimizer.out b/src/test/regress/expected/bfv_olap_optimizer.out index 16de252c48..78f8628003 100644 --- a/src/test/regress/expected/bfv_olap_optimizer.out +++ b/src/test/regress/expected/bfv_olap_optimizer.out @@ -824,7 +824,6 @@ select * from (select sum(a.salary) over(), count(*) 2100 | 1 (2 rows) --- this query currently falls back, needs to be fixed select (select rn from (select row_number() over () as rn, name from t1_github_issue_10143 where code = a.code @@ -833,8 +832,6 @@ select (select rn from (select row_number() over () as rn, name ,sum(sum(a.salary)) over() from t2_github_issue_10143 a group by a.code; -INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner -DETAIL: Query-to-DXL Translation: No variable entry found due to incorrect normalization of query dongnm | sum --------+------ 1 | 2100 diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index add31f7eeb..7be004172b 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1981,3 +1981,46 @@ select * from x for update; (5 rows) set gp_cte_sharing to off; +-- Ensure that both planners produce valid plans for the query with the nested +-- SubLink, which contains attributes referenced in query's GROUP BY clause. +-- Due to presence of non-grouping columns in targetList, ORCA performs query +-- normalization, during which ORCA establishes a correspondence between vars +-- from targetlist entries to grouping attributes. And this process should +-- correctly handle nested structures. The inner part of SubPlan in the test +-- should contain only t.j. +-- start_ignore +drop table if exists t; +NOTICE: table "t" does not exist, skipping +-- end_ignore +create table t (i int, j int) distributed by (i); +insert into t values (1, 2); +explain (verbose, costs off) +select j, +(select j from (select j) q2) +from t +group by i, j; + QUERY PLAN +------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) + Output: t.j, ((SubPlan 1)), t.i + -> HashAggregate + Output: t.j, (SubPlan 1), t.i + Group Key: t.i, t.j + -> Seq Scan on public.t + Output: t.i, t.j + SubPlan 1 + -> Result + Output: t.j + Optimizer: Postgres-based planner +(11 rows) + +select j, +(select j from (select j) q2) +from t +group by i, j; + j | j +---+--- + 2 | 2 +(1 row) + +drop table t; diff --git a/src/test/regress/expected/subselect_optimizer.out b/src/test/regress/expected/subselect_optimizer.out index b60d15f74d..38d440d121 100644 --- a/src/test/regress/expected/subselect_optimizer.out +++ b/src/test/regress/expected/subselect_optimizer.out @@ -2056,3 +2056,51 @@ select * from x for update; (6 rows) set gp_cte_sharing to off; +-- Ensure that both planners produce valid plans for the query with the nested +-- SubLink, which contains attributes referenced in query's GROUP BY clause. +-- Due to presence of non-grouping columns in targetList, ORCA performs query +-- normalization, during which ORCA establishes a correspondence between vars +-- from targetlist entries to grouping attributes. And this process should +-- correctly handle nested structures. The inner part of SubPlan in the test +-- should contain only t.j. +-- start_ignore +drop table if exists t; +NOTICE: table "t" does not exist, skipping +-- end_ignore +create table t (i int, j int) distributed by (i); +insert into t values (1, 2); +explain (verbose, costs off) +select j, +(select j from (select j) q2) +from t +group by i, j; + QUERY PLAN +------------------------------------------ + Gather Motion 3:1 (slice1; segments: 3) + Output: j, ((SubPlan 1)) + -> GroupAggregate + Output: j, (SubPlan 1) + Group Key: t.i, t.j + -> Sort + Output: i, j + Sort Key: t.i, t.j + -> Seq Scan on public.t + Output: i, j + SubPlan 1 + -> Result + Output: t.j + -> Result + Output: true + Optimizer: GPORCA +(16 rows) + +select j, +(select j from (select j) q2) +from t +group by i, j; + j | j +---+--- + 2 | 2 +(1 row) + +drop table t; diff --git a/src/test/regress/sql/bfv_olap.sql b/src/test/regress/sql/bfv_olap.sql index c1a7a7759e..2b5b1d67fa 100644 --- a/src/test/regress/sql/bfv_olap.sql +++ b/src/test/regress/sql/bfv_olap.sql @@ -449,7 +449,6 @@ select * from (select sum(a.salary) over(), count(*) from t2_github_issue_10143 a group by a.salary) T; --- this query currently falls back, needs to be fixed select (select rn from (select row_number() over () as rn, name from t1_github_issue_10143 where code = a.code diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 414f93beaf..4ee4f8f8f9 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -981,3 +981,29 @@ with x as (select * from subselect_tbl) select * from x for update; set gp_cte_sharing to off; + +-- Ensure that both planners produce valid plans for the query with the nested +-- SubLink, which contains attributes referenced in query's GROUP BY clause. +-- Due to presence of non-grouping columns in targetList, ORCA performs query +-- normalization, during which ORCA establishes a correspondence between vars +-- from targetlist entries to grouping attributes. And this process should +-- correctly handle nested structures. The inner part of SubPlan in the test +-- should contain only t.j. +-- start_ignore +drop table if exists t; +-- end_ignore +create table t (i int, j int) distributed by (i); +insert into t values (1, 2); + +explain (verbose, costs off) +select j, +(select j from (select j) q2) +from t +group by i, j; + +select j, +(select j from (select j) q2) +from t +group by i, j; + +drop table t; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
