(spark) branch branch-3.5 updated: [SPARK-48428][SQL] Fix IllegalStateException in NestedColumnAliasing

wenchen Thu, 27 Jun 2024 00:32:17 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new d35af54842d1 [SPARK-48428][SQL] Fix IllegalStateException in 
NestedColumnAliasing
d35af54842d1 is described below

commit d35af54842d17725aea7208c9d7b170b33a95012
Author: Emil Ejbyfeldt <[email protected]>
AuthorDate: Thu Jun 27 15:31:20 2024 +0800

    [SPARK-48428][SQL] Fix IllegalStateException in NestedColumnAliasing
    
    ### What changes were proposed in this pull request?
    
    #35170 (SPARK-37855) and #32301 (SPARK-35194) introduced conditions for 
ExtractValues that currently cannot be handled. These conditions are applied 
after `collectRootReferenceAndExtractValue` and simply remove such candidates. 
This is problematic because the removed expressions might have contained 
`AttributeReference`s that must be taken into account to avoid incorrect 
aliasing. This PR fixes this family of bugs by moving the conditions into the 
function `collectRootReferenceAndExtractValue`.
    
    ### Why are the changes needed?
    
    The current code leads to `IllegalStateException` runtime failures.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, fixes a bug.
    
    ### How was this patch tested?
    
    Existing and new unit tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #46756 from eejbyfeldt/SPARK-48428.
    
    Authored-by: Emil Ejbyfeldt <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
    (cherry picked from commit b11608c96f8aeeaa03c6e5038700483266b32448)
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../catalyst/optimizer/NestedColumnAliasing.scala   | 16 ++++++++--------
 .../optimizer/NestedColumnAliasingSuite.scala       | 21 +++++++++++++++++++++
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
index 5d4fcf772b8f..778813e4e9c6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
@@ -217,6 +217,11 @@ object NestedColumnAliasing {
     case _ => false
   }
 
+  private def canAlias(ev: Expression): Boolean = {
+    // we can not alias the attr from lambda variable whose expr id is not 
available
+    !ev.exists(_.isInstanceOf[NamedLambdaVariable]) && ev.references.size == 1
+  }
+
   /**
    * Returns two types of expressions:
    * - Root references that are individually accessed
@@ -225,11 +230,11 @@ object NestedColumnAliasing {
    */
   private def collectRootReferenceAndExtractValue(e: Expression): 
Seq[Expression] = e match {
     case _: AttributeReference => Seq(e)
-    case GetStructField(_: ExtractValue | _: AttributeReference, _, _) => 
Seq(e)
+    case GetStructField(_: ExtractValue | _: AttributeReference, _, _) if 
canAlias(e) => Seq(e)
     case GetArrayStructFields(_: MapValues |
                               _: MapKeys |
                               _: ExtractValue |
-                              _: AttributeReference, _, _, _, _) => Seq(e)
+                              _: AttributeReference, _, _, _, _) if 
canAlias(e) => Seq(e)
     case es if es.children.nonEmpty => 
es.children.flatMap(collectRootReferenceAndExtractValue)
     case _ => Seq.empty
   }
@@ -248,13 +253,8 @@ object NestedColumnAliasing {
     val otherRootReferences = new mutable.ArrayBuffer[AttributeReference]()
     exprList.foreach { e =>
       extractor(e).foreach {
-        // we can not alias the attr from lambda variable whose expr id is not 
available
-        case ev: ExtractValue if 
!ev.exists(_.isInstanceOf[NamedLambdaVariable]) =>
-          if (ev.references.size == 1) {
-            nestedFieldReferences.append(ev)
-          }
+        case ev: ExtractValue => nestedFieldReferences.append(ev)
         case ar: AttributeReference => otherRootReferences.append(ar)
-        case _ => // ignore
       }
     }
     val exclusiveAttrSet = AttributeSet(exclusiveAttrs ++ otherRootReferences)
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
index cb6b9ac8d8be..6ce394dbd68b 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
@@ -861,6 +861,27 @@ class NestedColumnAliasingSuite extends SchemaPruningTest {
     // The plan is expected to be unchanged.
     comparePlans(plan, RemoveNoopOperators.apply(optimized.get))
   }
+
+  test("SPARK-48428: Do not pushdown when attr is used in expression with 
mutliple references") {
+    val query = contact
+      .limit(5)
+      .select(
+        GetStructField(GetStructField(CreateStruct(Seq($"id", $"employer")), 
1), 0),
+        $"employer.id")
+      .analyze
+
+    val optimized = Optimize.execute(query)
+
+    val expected = contact
+      .select($"id", $"employer")
+      .limit(5)
+      .select(
+        GetStructField(GetStructField(CreateStruct(Seq($"id", $"employer")), 
1), 0),
+        $"employer.id")
+      .analyze
+
+    comparePlans(optimized, expected)
+  }
 }
 
 object NestedColumnAliasingSuite {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch branch-3.5 updated: [SPARK-48428][SQL] Fix IllegalStateException in NestedColumnAliasing

Reply via email to