This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit c10ea53b69913e39f5b976e7239febb6e25d0795
Author: Alexander Kondakov <[email protected]>
AuthorDate: Tue Aug 29 12:27:27 2023 +0300

    Fix ORCA invalid processing of nested SubLinks under aggregates.
    
    The ORCA optimizer could produce an invalid query plan and fallback to the
    legacy optimizer due to incorrect processing of nested SubLinks (SubLink
    contains one more SubLink or rtable subquery inside) during query 
normalization.
    
    The issue could arise when query had a AggRef inside a Sublink and that
    AggRef contained one more SubLink inside (see the example of the
    query in the tests section).
    
    While mutating the SubLink inside RunExtractAggregatesMutator, the Aggref 
branch
    is executed for SubLink's query targetList, which mutates args of the 
AggRef.
    And if inside this AggRef arg some Var had varlevelsup value greater than 
the
    m_agg_levels_up, i.e the Var referenced a relation that is higher than the
    AggRef, the Var was not mutated at all. The varlevelsup field was not 
modified
    and because of that, when AggRef was pulled up to the zero query level (by
    appending it to the context.m_lower_table_tlist), the Var, whose varlevelsup
    was higher than AggRef's, and which remained unchanged, started referencing
    the wrong relation. This could lead to the fallback or invalid plan 
construction
    when processing attributes at further planning stages.
    
    This patch makes the RunExtractAggregatesMutator modify the varlevelsup of 
the
    Var relatively the AggRef level in order to preserve proper query level.
---
 src/backend/gpopt/translate/CQueryMutators.cpp     | 38 ++++++++++++++--------
 src/test/regress/expected/aggregates_optimizer.out |  2 --
 src/test/regress/expected/subselect.out            | 31 ++++++++++++++++++
 src/test/regress/expected/subselect_optimizer.out  | 33 +++++++++++++++++++
 src/test/regress/sql/subselect.sql                 |  9 +++++
 5 files changed, 98 insertions(+), 15 deletions(-)

diff --git a/src/backend/gpopt/translate/CQueryMutators.cpp 
b/src/backend/gpopt/translate/CQueryMutators.cpp
index 6557918665..7a77934adb 100644
--- a/src/backend/gpopt/translate/CQueryMutators.cpp
+++ b/src/backend/gpopt/translate/CQueryMutators.cpp
@@ -698,22 +698,34 @@ CQueryMutators::RunExtractAggregatesMutator(Node *node,
                // Handle other top-level outer references in the project 
element.
                if (var->varlevelsup == context->m_current_query_level)
                {
-                       if (var->varlevelsup == context->m_agg_levels_up)
+                       if (var->varlevelsup >= context->m_agg_levels_up)
                        {
-                               // If Var references the top level query inside 
an Aggref that also
-                               // references top level query, the Aggref is 
moved to the derived query
-                               // (see comments in Aggref if-case above). 
Thus, these Var references
-                               // are brought up to the top-query level.
+                               // If Var references the top level query 
(varlevelsup = m_current_query_level)
+                               // inside an Aggref that also references top 
level query, the Aggref is moved
+                               // to the derived query (see comments in Aggref 
if-case above).
+                               // And, therefore, if we are mutating such Vars 
inside the Aggref, we must
+                               // change their varlevelsup field in order to 
preserve correct reference level.
+                               // i.e these Vars are pulled up as the part of 
the Aggref by the m_agg_levels_up.
                                // e.g:
-                               // explain select (select sum(foo.a) from jazz) 
from foo group by a, b;
+                               // select (select max((select foo.a))) from foo;
                                // is transformed into
-                               // select (select fnew.sum_t from jazz)
-                               // from (select foo.a, foo.b, sum(foo.a) sum_t
-                               //       from foo group by foo.a, foo.b) fnew;
-                               //
-                               // Note the foo.a var which is in sum() in a 
subquery must now become a
-                               // var referencing the current query level.
-                               var->varlevelsup = 0;
+                               // select (select fnew.max_t)
+                               // from (select max((select foo.a)) max_t from 
foo) fnew;
+                               // Here the foo.a inside max referenced top 
level RTE foo at
+                               // varlevelsup = 2 inside the Aggref at 
agglevelsup 1. Then the
+                               // Aggref is brought up to the top-query-level 
of fnew and foo.a
+                               // inside Aggref is bumped up by original 
Aggref's level.
+                               // We may visualize that logic with the 
following diagram:
+                               // Query <------┐  <--------------------┐
+                               //              |                       |
+                               //              | m_agg_levels_up = 1   |
+                               //              |                       |
+                               //     Aggref --┘                       | 
varlevelsup = 2
+                               //                                      |
+                               //                                      |
+                               //                                      |
+                               //         Var -------------------------┘
+                               var->varlevelsup -= context->m_agg_levels_up;
                                return (Node *) var;
                        }
 
diff --git a/src/test/regress/expected/aggregates_optimizer.out 
b/src/test/regress/expected/aggregates_optimizer.out
index 4b735570df..944c8cbf57 100644
--- a/src/test/regress/expected/aggregates_optimizer.out
+++ b/src/test/regress/expected/aggregates_optimizer.out
@@ -659,8 +659,6 @@ LINE 4:                where sum(distinct a.four + b.four) 
= b.four)...
 select
   (select max((select i.unique2 from tenk1 i where i.unique1 = o.unique1)))
 from tenk1 o;
-INFO:  GPORCA failed to produce a plan, falling back to Postgres-based planner
-DETAIL:  Query-to-DXL Translation: No variable entry found due to incorrect 
normalization of query
  max  
 ------
  9999
diff --git a/src/test/regress/expected/subselect.out 
b/src/test/regress/expected/subselect.out
index 31bf220bd2..ea96cc3ca7 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -2066,4 +2066,35 @@ group by j, q1;
  2 | 1 |  2
 (1 row)
 
+-- Ensure that both planners produce valid plans for the query with the nested
+-- SubLink, and this SubLink is under aggregation. ORCA shouldn't fall back due
+-- to missing variable entry as a result of incorrect query normalization. ORCA
+-- should correctly process args of the aggregation during normalization.
+explain (verbose, costs off)
+select (select max((select t.i))) from t;
+                   QUERY PLAN                   
+------------------------------------------------
+ Finalize Aggregate
+   Output: (SubPlan 2)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL max((SubPlan 1)))
+         ->  Partial Aggregate
+               Output: PARTIAL max((SubPlan 1))
+               ->  Seq Scan on public.t
+                     Output: t.i, t.j
+               SubPlan 1
+                 ->  Result
+                       Output: t.i
+   SubPlan 2
+     ->  Result
+           Output: max((SubPlan 1))
+ Optimizer: Postgres-based planner
+(15 rows)
+
+select (select max((select t.i))) from t;
+ max 
+-----
+   1
+(1 row)
+
 drop table t;
diff --git a/src/test/regress/expected/subselect_optimizer.out 
b/src/test/regress/expected/subselect_optimizer.out
index 322e14951c..da6a4c95c5 100644
--- a/src/test/regress/expected/subselect_optimizer.out
+++ b/src/test/regress/expected/subselect_optimizer.out
@@ -2150,4 +2150,37 @@ group by j, q1;
  2 | 1 |  2
 (1 row)
 
+-- Ensure that both planners produce valid plans for the query with the nested
+-- SubLink, and this SubLink is under aggregation. ORCA shouldn't fall back due
+-- to missing variable entry as a result of incorrect query normalization. ORCA
+-- should correctly process args of the aggregation during normalization.
+explain (verbose, costs off)
+select (select max((select t.i))) from t;
+                   QUERY PLAN                   
+------------------------------------------------
+ Finalize Aggregate
+   Output: (SubPlan 2)
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         Output: (PARTIAL max((SubPlan 1)))
+         ->  Partial Aggregate
+               Output: PARTIAL max((SubPlan 1))
+               ->  Seq Scan on public.t
+                     Output: i
+               SubPlan 1
+                 ->  Result
+                       Output: t.i
+   SubPlan 2
+     ->  Result
+           Output: max((SubPlan 1))
+           ->  Result
+                 Output: true
+ Optimizer: GPORCA
+(17 rows)
+
+select (select max((select t.i))) from t;
+ max 
+-----
+   1
+(1 row)
+
 drop table t;
diff --git a/src/test/regress/sql/subselect.sql 
b/src/test/regress/sql/subselect.sql
index e2cccf1a95..02c11c1aac 100644
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -1025,4 +1025,13 @@ select j, 1 as c,
 from t
 group by j, q1;
 
+-- Ensure that both planners produce valid plans for the query with the nested
+-- SubLink, and this SubLink is under aggregation. ORCA shouldn't fall back due
+-- to missing variable entry as a result of incorrect query normalization. ORCA
+-- should correctly process args of the aggregation during normalization.
+explain (verbose, costs off)
+select (select max((select t.i))) from t;
+
+select (select max((select t.i))) from t;
+
 drop table t;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to