This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch 
yjhjstz/fix/orca-use-after-free-having-decorrelation
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit 69a2a53294f12c601cca4fe8a257cd462d24b8f3
Author: Jianghua Yang <[email protected]>
AuthorDate: Sat Mar 14 03:51:08 2026 +0800

    ORCA: Fix incorrect decorrelation of GROUP BY () HAVING <outer_ref>
    
    Force correlated execution (SubPlan) for scalar subqueries with
    GROUP BY () and a correlated HAVING clause. Previously ORCA
    decorrelated such subqueries into Left Outer Join + COALESCE(count,0),
    which incorrectly returned 0 instead of NULL when the HAVING
    condition was false.
    
    Add FHasCorrelatedSelectAboveGbAgg() to detect the pattern where
    NormalizeHaving() has converted the HAVING clause into a
    CLogicalSelect with outer refs above a CLogicalGbAgg with empty
    grouping columns. When detected, set m_fCorrelatedExecution = true
    in Psd() to bypass the COALESCE decorrelation path.
    
    Update groupingsets_optimizer.out expected output to reflect the
    new ORCA SubPlan format instead of Postgres planner fallback.
    
    Reported-in: https://github.com/apache/cloudberry/issues/1618
---
 .../libgpopt/src/xforms/CSubqueryHandler.cpp       | 115 ++++++++++++++++++++-
 .../regress/expected/groupingsets_optimizer.out    |  28 ++---
 2 files changed, 128 insertions(+), 15 deletions(-)

diff --git a/src/backend/gporca/libgpopt/src/xforms/CSubqueryHandler.cpp 
b/src/backend/gporca/libgpopt/src/xforms/CSubqueryHandler.cpp
index 791ee8345a9..a3c333c5ab1 100644
--- a/src/backend/gporca/libgpopt/src/xforms/CSubqueryHandler.cpp
+++ b/src/backend/gporca/libgpopt/src/xforms/CSubqueryHandler.cpp
@@ -308,6 +308,93 @@ CSubqueryHandler::FProjectCountSubquery(CExpression 
*pexprSubquery,
 }
 
 
+//---------------------------------------------------------------------------
+//     @function:
+//             FContainsEmptyGbAgg
+//
+//     @doc:
+//             Return true if pexpr contains a GbAgg with empty grouping 
columns
+//             (i.e., GROUP BY ())
+//
+//---------------------------------------------------------------------------
+static BOOL
+FContainsEmptyGbAgg(CExpression *pexpr)
+{
+       if (COperator::EopLogicalGbAgg == pexpr->Pop()->Eopid())
+       {
+               return 0 == 
CLogicalGbAgg::PopConvert(pexpr->Pop())->Pdrgpcr()->Size();
+       }
+       const ULONG arity = pexpr->Arity();
+       for (ULONG ul = 0; ul < arity; ul++)
+       {
+               CExpression *pexprChild = (*pexpr)[ul];
+               if (pexprChild->Pop()->FLogical() && 
FContainsEmptyGbAgg(pexprChild))
+               {
+                       return true;
+               }
+       }
+       return false;
+}
+
+
+//---------------------------------------------------------------------------
+//     @function:
+//             FHasCorrelatedSelectAboveGbAgg
+//
+//     @doc:
+//             Return true if pexpr has a CLogicalSelect with outer references 
in its
+//             filter predicate that sits above a GROUP BY () aggregate.  This 
pattern
+//             arises when a correlated scalar subquery has a correlated 
HAVING clause,
+//             e.g. "SELECT count(*) FROM t GROUP BY () HAVING outer_col".
+//
+//             When such a pattern exists the scalar subquery must NOT be 
decorrelated
+//             with COALESCE(count,0) semantics: if the HAVING condition is 
false the
+//             subquery should return NULL (no rows), not 0.
+//
+//---------------------------------------------------------------------------
+static BOOL
+FHasCorrelatedSelectAboveGbAgg(CExpression *pexpr)
+{
+       // Stop recursion at a GbAgg boundary: we are looking for a Select
+       // that sits *above* a GbAgg, so once we reach the GbAgg there is
+       // nothing more to check in this branch.
+       if (COperator::EopLogicalGbAgg == pexpr->Pop()->Eopid())
+       {
+               return false;
+       }
+
+       if (COperator::EopLogicalSelect == pexpr->Pop()->Eopid() &&
+               pexpr->HasOuterRefs())
+       {
+               // The Select has outer references somewhere in its subtree.
+               // Check whether they originate from the filter (child 1) rather
+               // than from the logical child (child 0).  If the logical child 
has
+               // no outer refs but the Select as a whole does, the outer refs 
must
+               // come from the filter predicate — exactly the 
correlated-HAVING
+               // pattern we want to detect.
+               CExpression *pexprLogicalChild = (*pexpr)[0];
+               if (!pexprLogicalChild->HasOuterRefs() &&
+                       FContainsEmptyGbAgg(pexprLogicalChild))
+               {
+                       return true;
+               }
+       }
+
+       // Recurse into logical children only.
+       const ULONG arity = pexpr->Arity();
+       for (ULONG ul = 0; ul < arity; ul++)
+       {
+               CExpression *pexprChild = (*pexpr)[ul];
+               if (pexprChild->Pop()->FLogical() &&
+                       FHasCorrelatedSelectAboveGbAgg(pexprChild))
+               {
+                       return true;
+               }
+       }
+       return false;
+}
+
+
 //---------------------------------------------------------------------------
 //     @function:
 //             CSubqueryHandler::SSubqueryDesc::SetCorrelatedExecution
@@ -382,6 +469,21 @@ CSubqueryHandler::Psd(CMemoryPool *mp, CExpression 
*pexprSubquery,
        // set flag of correlated execution
        psd->SetCorrelatedExecution();
 
+       // A correlated scalar subquery of the form
+       //   SELECT count(*) FROM t GROUP BY () HAVING <outer_ref_condition>
+       // must execute as a correlated SubPlan.  After NormalizeHaving() the 
HAVING
+       // clause becomes a CLogicalSelect with outer refs sitting above the 
GbAgg.
+       // If we decorrelate such a subquery the join filter replaces the HAVING
+       // condition, but a LEFT JOIN returns 0 (not NULL) for count(*) when no
+       // rows match — which is semantically wrong.  Forcing correlated 
execution
+       // preserves the correct NULL-when-no-rows semantics.
+       if (!psd->m_fCorrelatedExecution && psd->m_fHasCountAgg &&
+               psd->m_fHasOuterRefs &&
+               FHasCorrelatedSelectAboveGbAgg(pexprInner))
+       {
+               psd->m_fCorrelatedExecution = true;
+       }
+
        return psd;
 }
 
@@ -753,8 +855,19 @@ CSubqueryHandler::FCreateOuterApplyForScalarSubquery(
        *ppexprNewOuter = pexprPrj;
 
        BOOL fGeneratedByQuantified = popSubquery->FGeneratedByQuantified();
+
+       // When GROUP BY () has a correlated HAVING clause (now represented as a
+       // CLogicalSelect with outer refs sitting above the GbAgg), the subquery
+       // must return NULL — not 0 — when the HAVING condition is false.
+       // Applying COALESCE(count,0) would incorrectly convert that NULL to 0,
+       // so we skip the special count(*) semantics in that case.
+       BOOL fCorrelatedHavingAboveEmptyGby =
+               (fHasCountAggMatchingColumn && 0 == pgbAgg->Pdrgpcr()->Size() &&
+                FHasCorrelatedSelectAboveGbAgg((*pexprSubquery)[0]));
+
        if (fGeneratedByQuantified ||
-               (fHasCountAggMatchingColumn && 0 == pgbAgg->Pdrgpcr()->Size()))
+               (fHasCountAggMatchingColumn && 0 == pgbAgg->Pdrgpcr()->Size() &&
+                !fCorrelatedHavingAboveEmptyGby))
        {
                CMDAccessor *md_accessor = COptCtxt::PoctxtFromTLS()->Pmda();
                const IMDTypeInt8 *pmdtypeint8 = 
md_accessor->PtMDType<IMDTypeInt8>();
diff --git a/src/test/regress/expected/groupingsets_optimizer.out 
b/src/test/regress/expected/groupingsets_optimizer.out
index 08ef4c1a68c..3718069c36c 100644
--- a/src/test/regress/expected/groupingsets_optimizer.out
+++ b/src/test/regress/expected/groupingsets_optimizer.out
@@ -958,21 +958,21 @@ select v.c, (select count(*) from gstest2 group by () 
having v.c)
 explain (costs off)
   select v.c, (select count(*) from gstest2 group by () having v.c)
     from (values (false),(true)) v(c) order by v.c;
-                                QUERY PLAN                                
---------------------------------------------------------------------------
- Sort
-   Sort Key: "*VALUES*".column1
-   ->  Values Scan on "*VALUES*"
-         SubPlan 1
-           ->  Aggregate
-                 Group Key: ()
-                 Filter: "*VALUES*".column1
-                 ->  Result
-                       One-Time Filter: "*VALUES*".column1
-                       ->  Materialize
-                             ->  Gather Motion 3:1  (slice1; segments: 3)
+                             QUERY PLAN
+--------------------------------------------------------------------
+ Result
+   ->  Sort
+         Sort Key: "Values".column1
+         ->  Values Scan on "Values"
+   SubPlan 1
+     ->  Result
+           One-Time Filter: "Values".column1
+           ->  Finalize Aggregate
+                 ->  Materialize
+                       ->  Gather Motion 3:1  (slice1; segments: 3)
+                             ->  Partial Aggregate
                                    ->  Seq Scan on gstest2
- Optimizer: Postgres query optimizer
+ Optimizer: GPORCA
 (13 rows)
 
 -- HAVING with GROUPING queries


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to