This is an automated email from the ASF dual-hosted git repository.

ptoth pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 3f0c4506f43a [SPARK-53094][SQL] Fix CUBE with aggregate containing HAVING clauses
3f0c4506f43a is described below

commit 3f0c4506f43a61d85c630057d40c83eb2c792d88
Author: Peter Toth <peter.t...@gmail.com>
AuthorDate: Tue Aug 5 20:33:59 2025 +0200

    [SPARK-53094][SQL] Fix CUBE with aggregate containing HAVING clauses
    
    ### What changes were proposed in this pull request?
    
    This is an alternative PR to https://github.com/apache/spark/pull/51810 to fix a regression introduced in Spark 3.2 by https://github.com/apache/spark/pull/32470.
    This PR defers the resolution of not fully resolved `UnresolvedHaving` nodes from `ResolveGroupingAnalytics`:
    ```
    === Applying Rule org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveGroupingAnalytics ===
    Before:
    'Sort ['s DESC NULLS LAST], true
    +- 'UnresolvedHaving ('count('product) > 2)
       +- 'Aggregate [cube(Vector(0), Vector(1), product#261, region#262)], [product#261, region#262, sum(amount#263) AS s#264L]
          +- SubqueryAlias t
             +- LocalRelation [product#261, region#262, amount#263]

    After:
    'Sort ['s DESC NULLS LAST], true
    +- 'UnresolvedHaving ('count(tempresolvedcolumn(product#261, product, false)) > 2)
       +- Aggregate [product#269, region#270, spark_grouping_id#268L], [product#269, region#270, sum(amount#263) AS s#264L]
          +- Expand [[product#261, region#262, amount#263, product#266, region#267, 0], [product#261, region#262, amount#263, product#266, null, 1], [product#261, region#262, amount#263, null, region#267, 2], [product#261, region#262, amount#263, null, null, 3]], [product#261, region#262, amount#263, product#269, region#270, spark_grouping_id#268L]
             +- Project [product#261, region#262, amount#263, product#261 AS product#266, region#262 AS region#267]
                +- SubqueryAlias t
                   +- LocalRelation [product#261, region#262, amount#263]
    ```
    to `ResolveAggregateFunctions`, which adds the correct aggregate expressions (`count(product#261)`):
    ```
    === Applying Rule org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveAggregateFunctions ===
    Before:
    'Sort ['s DESC NULLS LAST], true
    +- 'UnresolvedHaving (count(tempresolvedcolumn(product#261, product, false)) > cast(2 as bigint))
       +- Aggregate [product#269, region#270, spark_grouping_id#268L], [product#269, region#270, sum(amount#263) AS s#264L]
          +- Expand [[product#261, region#262, amount#263, product#266, region#267, 0], [product#261, region#262, amount#263, product#266, null, 1], [product#261, region#262, amount#263, null, region#267, 2], [product#261, region#262, amount#263, null, null, 3]], [product#261, region#262, amount#263, product#269, region#270, spark_grouping_id#268L]
             +- Project [product#261, region#262, amount#263, product#261 AS product#266, region#262 AS region#267]
                +- SubqueryAlias t
                   +- LocalRelation [product#261, region#262, amount#263]

    After:
    'Sort ['s DESC NULLS LAST], true
    +- Project [product#269, region#270, s#264L]
       +- Filter (count(product)#272L > cast(2 as bigint))
          +- Aggregate [product#269, region#270, spark_grouping_id#268L], [product#269, region#270, sum(amount#263) AS s#264L, count(product#26 [...]
             +- Expand [[product#261, region#262, amount#263, product#266, region#267, 0], [product#261, region#262, amount#263, product#266, n [...]
                +- Project [product#261, region#262, amount#263, product#261 AS product#266, region#262 AS region#267]
                   +- SubqueryAlias t
                      +- LocalRelation [product#261, region#262, amount#263]
    ```
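
    For a concrete repro, this is the query shape the regression affected, taken from the new test added in `SQLQuerySuite` below (`spark` here is assumed to be an active `SparkSession`):

    ```
    // CUBE plus a HAVING clause whose aggregate, count(product), does not
    // appear in the SELECT list: the combination that hit the regression.
    val df = spark.sql(
      """SELECT product, region, sum(amount) AS s
        |FROM VALUES
        |  ('a', 'east', 100), ('b', 'east', 200), ('a', 'west', 150),
        |  ('b', 'west', 250), ('a', 'east', 120) AS t(product, region, amount)
        |GROUP BY product, region WITH CUBE
        |HAVING count(product) > 2
        |ORDER BY s DESC""".stripMargin)
    // With the fix, only grouping-set rows with more than two products remain:
    // [null, null, 820], [null, east, 420], [a, null, 370]
    df.show()
    ```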
    
    ### Why are the changes needed?
    
    Fix a correctness issue described in https://github.com/apache/spark/pull/51810.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, it fixes a correctness issue.
    
    ### How was this patch tested?
    
    Added new UT from https://github.com/apache/spark/pull/51810.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #51820 from peter-toth/SPARK-53094-fix-cube-having.
    
    Lead-authored-by: Peter Toth <peter.t...@gmail.com>
    Co-authored-by: harris233 <1657417...@qq.com>
    Signed-off-by: Peter Toth <peter.t...@gmail.com>
---
 .../apache/spark/sql/catalyst/analysis/Analyzer.scala    |  4 ++++
 .../sql-tests/analyzer-results/grouping_set.sql.out      |  2 +-
 .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala  | 16 ++++++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 12ba41145c20..e49e6aa7f044 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -792,6 +792,10 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
       } else {
         colResolved.havingCondition
       }
+      // `cond` might contain unresolved aggregate functions so defer its resolution to
+      // `ResolveAggregateFunctions` rule if needed.
+      if (!cond.resolved) return colResolved
+
       // Try resolving the condition of the filter as though it is in the aggregate clause
       val (extraAggExprs, Seq(resolvedHavingCond)) =
         ResolveAggregateFunctions.resolveExprsWithAggregate(Seq(cond), aggForResolving)
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out
index 254f9d078540..2c63fb1525a4 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/grouping_set.sql.out
@@ -116,7 +116,7 @@ FROM   (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3)
 GROUP  BY GROUPING SETS ( ( c1 ), ( c2 ) )
 HAVING GROUPING__ID > 1
 -- !query analysis
-Filter (grouping__id#xL > cast(1 as bigint))
+Filter (GROUPING__ID#xL > cast(1 as bigint))
 +- Aggregate [c1#x, c2#x, spark_grouping_id#xL], [c1#x, c2#x, sum(c3#x) AS sum(c3)#xL, spark_grouping_id#xL AS grouping__id#xL]
    +- Expand [[c1#x, c2#x, c3#x, c1#x, null, 1], [c1#x, c2#x, c3#x, null, c2#x, 2]], [c1#x, c2#x, c3#x, c1#x, c2#x, spark_grouping_id#xL]
       +- Project [c1#x, c2#x, c3#x, c1#x AS c1#x, c2#x AS c2#x]
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 107514edbc87..905f34cd7d34 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -5057,6 +5057,22 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
       }
     }
   }
+
+  test("SPARK-53094: Fix cube-related data quality problem") {
+    val df = sql(
+      """SELECT product, region, sum(amount) AS s
+        |FROM VALUES
+        |  ('a', 'east', 100),
+        |  ('b', 'east', 200),
+        |  ('a', 'west', 150),
+        |  ('b', 'west', 250),
+        |  ('a', 'east', 120) AS t(product, region, amount)
+        |GROUP BY product, region WITH CUBE
+        |HAVING count(product) > 2
+        |ORDER BY s DESC""".stripMargin)
+
+    checkAnswer(df, Seq(Row(null, null, 820), Row(null, "east", 420), Row("a", null, 370)))
+  }
 }
 
 case class Foo(bar: Option[String])

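For context on the mechanics: `ResolveGroupingAnalytics` now returns early when the
rewritten HAVING condition is not yet fully resolved, leaving the `UnresolvedHaving`
node in place for `ResolveAggregateFunctions` to pick up on a later pass of the
fixed-point analyzer. Below is a minimal, self-contained sketch of that deferral
pattern; the types and rule functions are toy stand-ins, not Catalyst's real API.

```scala
// Toy model of the deferral pattern: an early rule refuses to rewrite a node
// whose condition is still unresolved, and a later rule finishes the job.
object HavingDeferralSketch extends App {
  sealed trait Expr { def resolved: Boolean }
  case class UnresolvedCount(column: String) extends Expr { val resolved = false }
  case class ResolvedCount(column: String) extends Expr { val resolved = true }

  sealed trait Plan
  case class UnresolvedHaving(cond: Expr, child: Plan) extends Plan
  case class Aggregate(groupings: Seq[String], child: Plan) extends Plan
  case class Filter(cond: Expr, child: Plan) extends Plan
  case class Relation(name: String) extends Plan

  // Stand-in for ResolveGroupingAnalytics: the guard mirrors the
  // `if (!cond.resolved) return colResolved` line added in Analyzer.scala.
  def resolveGroupingAnalytics(plan: Plan): Plan = plan match {
    case h @ UnresolvedHaving(cond, _) if !cond.resolved => h // defer
    case UnresolvedHaving(cond, child)                   => Filter(cond, child)
    case other                                           => other
  }

  // Stand-in for ResolveAggregateFunctions: once it resolves the aggregate in
  // the condition, the deferred HAVING can finally become a Filter.
  def resolveAggregateFunctions(plan: Plan): Plan = plan match {
    case UnresolvedHaving(UnresolvedCount(c), child) => Filter(ResolvedCount(c), child)
    case other                                       => other
  }

  val plan = UnresolvedHaving(
    UnresolvedCount("product"),
    Aggregate(Seq("product", "region"), Relation("t")))

  println(resolveGroupingAnalytics(plan))  // unchanged: resolution is deferred
  println(resolveAggregateFunctions(resolveGroupingAnalytics(plan)))  // Filter(...)
}
```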
