This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 98fad57221d [SPARK-36718][SQL][FOLLOWUP] Improve the extract-only check in CollapseProject
98fad57221d is described below

commit 98fad57221d4dffc6f1fe28d9aca1093172ecf72
Author: Wenchen Fan <wenc...@databricks.com>
AuthorDate: Tue May 17 15:56:47 2022 +0800

    [SPARK-36718][SQL][FOLLOWUP] Improve the extract-only check in CollapseProject
    
    ### What changes were proposed in this pull request?
    
    This is a follow-up to https://github.com/apache/spark/pull/36510, fixing a corner case: if the `CreateStruct` is referenced only once in non-extract expressions, we should still allow collapsing the projects.
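
    As an illustration (not part of the original patch; the column names and the repro below are made up), a query of this shape used to keep two separate Projects because `struct` is also referenced as a whole, even though collapsing would not actually duplicate the expensive `named_struct`:

    ```scala
    // Hypothetical repro sketch, assuming only the standard DataFrame API.
    // The struct column is consumed once as-is and once only through field
    // extraction, so collapsing the two projects builds the struct just once.
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions._

    object CollapseProjectCornerCase extends App {
      val spark = SparkSession.builder().master("local[1]").getOrCreate()
      import spark.implicits._

      val df = Seq(1, 2, 3).toDF("a")
        .select(struct($"a".as("a"), ($"a" + 1).as("a_plus_1")).as("struct"))
        .select($"struct", $"struct".getField("a"))

      // With this follow-up, the optimized plan should contain a single Project.
      df.explain(true)
      spark.stop()
    }
    ```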
    
    ### Why are the changes needed?
    
    Completely fixes the performance regression.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Added a new test in `CollapseProjectSuite`.
    
    Closes #36572 from cloud-fan/regression.
    
    Authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../apache/spark/sql/catalyst/optimizer/Optimizer.scala  | 16 +++++++++-------
 .../sql/catalyst/optimizer/CollapseProjectSuite.scala    | 11 +++++++++++
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 9215609f154..2f93cf2d8c3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -1008,20 +1008,22 @@ object CollapseProject extends Rule[LogicalPlan] with AliasHelper {
           val producer = producerMap.getOrElse(reference, reference)
           producer.deterministic && (count == 1 || alwaysInline || {
             val relatedConsumers = consumers.filter(_.references.contains(reference))
-            val extractOnly = relatedConsumers.forall(isExtractOnly(_, reference))
+            // It's still extract-only if there is only one reference in non-extract expressions,
+            // as we won't duplicate the expensive CreateStruct-like expressions.
+            val extractOnly = relatedConsumers.map(refCountInNonExtract(_, reference)).sum <= 1
             shouldInline(producer, extractOnly)
           })
       }
   }
 
-  private def isExtractOnly(expr: Expression, ref: Attribute): Boolean = {
-    def hasRefInNonExtractValue(e: Expression): Boolean = e match {
-      case a: Attribute => a.semanticEquals(ref)
+  private def refCountInNonExtract(expr: Expression, ref: Attribute): Int = {
+    def refCount(e: Expression): Int = e match {
+      case a: Attribute if a.semanticEquals(ref) => 1
       // The first child of `ExtractValue` is the complex type to be extracted.
-      case e: ExtractValue if e.children.head.semanticEquals(ref) => false
-      case _ => e.children.exists(hasRefInNonExtractValue)
+      case e: ExtractValue if e.children.head.semanticEquals(ref) => 0
+      case _ => e.children.map(refCount).sum
     }
-    !hasRefInNonExtractValue(expr)
+    refCount(expr)
   }
 
   /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
index dd075837d51..baa7c94069a 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CollapseProjectSuite.scala
@@ -143,6 +143,17 @@ class CollapseProjectSuite extends PlanTest {
       .select(($"a" + ($"a" + 1)).as("add"))
       .analyze
     comparePlans(optimized2, expected2)
+
+    // referencing `CreateStruct` only once in non-extract expression is OK.
+    val query3 = testRelation
+      .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"))
+      .select($"struct", $"struct".getField("a"))
+      .analyze
+    val optimized3 = Optimize.execute(query3)
+    val expected3 = testRelation
+      .select(namedStruct("a", $"a", "a_plus_1", $"a" + 1).as("struct"), $"a".as("struct.a"))
+      .analyze
+    comparePlans(optimized3, expected3)
   }
 
   test("preserve top-level alias metadata while collapsing projects") {

