This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 37028fafc4f9 [SPARK-52079] Normalize order of attributes in inner 
project lists
37028fafc4f9 is described below

commit 37028fafc4f9fc873195a88f0840ab69edcf9d2b
Author: Mihailo Timotic <mihailo.timo...@databricks.com>
AuthorDate: Mon May 12 14:05:00 2025 +0200

    [SPARK-52079] Normalize order of attributes in inner project lists
    
    ### What changes were proposed in this pull request?
    Normalize order of attributes in inner project lists. This PR 
reintroduces a change made in https://github.com/apache/spark/pull/49230. That 
change was reverted in https://github.com/apache/spark/pull/49460 because it was 
no longer necessary, but it is now needed again for a different use case.
    
    ### Why are the changes needed?
    The fixed-point analyzer resolves `Sort` and `Having` on top of an `Aggregate` 
via `TempResolvedColumn`. By doing this, fixed-point first extends the 
`Aggregate` with the resolved column and then proceeds with other resolution 
rules. This is a problem when resolving LCAs in fixed-point and single-pass, 
because single-pass adds this missing column after LCA resolution, whereas 
fixed-point first adds the column and only then performs LCA resolution. This 
causes plan mismatches where we have diffe [...]
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Added test cases.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #50858 from mihailotim-db/mihailotim-db/normalize_inner_project.
    
    Authored-by: Mihailo Timotic <mihailo.timo...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../spark/sql/catalyst/plans/NormalizePlan.scala   | 25 +++++++++++++++-------
 .../sql/catalyst/plans/NormalizePlanSuite.scala    | 20 +++++++++++++++++
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/NormalizePlan.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/NormalizePlan.scala
index 18339e81b682..bf2b38c45f85 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/NormalizePlan.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/NormalizePlan.scala
@@ -154,15 +154,14 @@ object NormalizePlan extends PredicateHelper {
             
.getTagValue(DeduplicateRelations.PROJECT_FOR_EXPRESSION_ID_DEDUPLICATION)
             .isDefined =>
         project.child
+      case Project(outerProjectList, innerProject: Project) =>
+        val normalizedInnerProjectList = 
normalizeProjectList(innerProject.projectList)
+        val orderedInnerProjectList = normalizedInnerProjectList.sortBy(_.name)
+        val newInnerProject =
+          Project(orderedInnerProjectList, innerProject.child)
+        Project(normalizeProjectList(outerProjectList), newInnerProject)
       case Project(projectList, child) =>
-        val projList = projectList
-          .map { e =>
-            e.transformUp {
-              case g: GetViewColumnByNameAndOrdinal => g.copy(viewDDL = None)
-            }
-          }
-          .asInstanceOf[Seq[NamedExpression]]
-        Project(projList, child)
+        Project(normalizeProjectList(projectList), child)
       case c: KeepAnalyzedQuery => c.storeAnalyzedQuery()
       case localRelation: LocalRelation if !localRelation.data.isEmpty =>
         /**
@@ -198,6 +197,16 @@ object NormalizePlan extends PredicateHelper {
     case LessThanOrEqual(l, r) if l.hashCode() > r.hashCode() => 
GreaterThanOrEqual(r, l)
     case _ => condition // Don't reorder.
   }
+
+  private def normalizeProjectList(projectList: Seq[NamedExpression]): 
Seq[NamedExpression] = {
+    projectList
+      .map { e =>
+        e.transformUp {
+          case g: GetViewColumnByNameAndOrdinal => g.copy(viewDDL = None)
+        }
+      }
+      .asInstanceOf[Seq[NamedExpression]]
+  }
 }
 
 class CteIdNormalizer {
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/NormalizePlanSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/NormalizePlanSuite.scala
index d2ac103e14f7..575b10c13349 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/NormalizePlanSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/NormalizePlanSuite.scala
@@ -38,6 +38,26 @@ import org.apache.spark.sql.types.BooleanType
 
 class NormalizePlanSuite extends SparkFunSuite with SQLConfHelper {
 
+  test("Normalize Project") {
+    val baselineCol1 = $"col1".int
+    val testCol1 = baselineCol1.newInstance()
+    val baselinePlan = LocalRelation(baselineCol1).select(baselineCol1)
+    val testPlan = LocalRelation(testCol1).select(testCol1)
+
+    assert(baselinePlan != testPlan)
+    assert(NormalizePlan(baselinePlan) == NormalizePlan(testPlan))
+  }
+
+  test("Normalize ordering in a project list of an inner Project") {
+    val baselinePlan =
+      LocalRelation($"col1".int, $"col2".string).select($"col1", 
$"col2").select($"col1")
+    val testPlan =
+      LocalRelation($"col1".int, $"col2".string).select($"col2", 
$"col1").select($"col1")
+
+    assert(baselinePlan != testPlan)
+    assert(NormalizePlan(baselinePlan) == NormalizePlan(testPlan))
+  }
+
   test("Normalize InheritAnalysisRules expressions") {
     val castWithoutTimezone =
       Cast(child = Literal(1), dataType = BooleanType, ansiEnabled = 
conf.ansiEnabled)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to