[spark] branch master updated: [SPARK-34776][SQL] Nested column pruning should not prune Window produced attributes

dongjoon Fri, 19 Mar 2021 11:44:45 -0700

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 7a8a600  [SPARK-34776][SQL] Nested column pruning should not prune 
Window produced attributes
7a8a600 is described below

commit 7a8a600995ddee32f0a9c81a97be3fc2bca21928
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Fri Mar 19 11:44:02 2021 -0700

    [SPARK-34776][SQL] Nested column pruning should not prune Window produced 
attributes
    
    ### What changes were proposed in this pull request?
    
    This patch proposes to fix a bug related to `NestedColumnAliasing`. The 
root cause is `Window`  doesn't override `producedAttributes` so 
`NestedColumnAliasing` rule wrongly prune attributes produced by `Window`.
    
    The master and branch-3.1 both have this issue.
    
    ### Why are the changes needed?
    
    It is needed to fix a bug of nested column pruning.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Unit test.
    
    Closes #31897 from viirya/SPARK-34776.
    
    Authored-by: Liang-Chi Hsieh <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../plans/logical/basicLogicalOperators.scala         |  2 ++
 .../scala/org/apache/spark/sql/DataFrameSuite.scala   | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index 99df8c4..0946773 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -670,6 +670,8 @@ case class Window(
   override def output: Seq[Attribute] =
     child.output ++ windowExpressions.map(_.toAttribute)
 
+  override def producedAttributes: AttributeSet = windowOutputSet
+
   def windowOutputSet: AttributeSet = 
AttributeSet(windowExpressions.map(_.toAttribute))
 }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index d754292..6e279f3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -41,6 +41,7 @@ import org.apache.spark.sql.execution.{FilterExec, 
QueryExecution, WholeStageCod
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.execution.aggregate.HashAggregateExec
 import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, 
ReusedExchangeExec, ShuffleExchangeExec}
+import org.apache.spark.sql.expressions.Window
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, 
SharedSparkSession}
@@ -2736,6 +2737,24 @@ class DataFrameSuite extends QueryTest
     val col2 = df.colRegex("test\n_table.`tes.*\n.*mn`")
     checkAnswer(df.select(col2), Row(1) :: Row(2) :: Row(3) :: Nil)
   }
+
+  test("SPARK-34776: Nested column pruning should not prune Window produced 
attributes") {
+    val df = Seq(
+      ("t1", "123", "bob"),
+      ("t1", "456", "bob"),
+      ("t2", "123", "sam")
+    ).toDF("type", "value", "name")
+
+    val test = df.select(
+      $"*",
+      struct(count($"*").over(Window.partitionBy($"type", $"value", $"name"))
+        .as("count"), $"name").as("name_count")
+    ).select(
+      $"*",
+      max($"name_count").over(Window.partitionBy($"type", 
$"value")).as("best_name")
+    )
+    checkAnswer(test.select($"best_name.name"), Row("bob") :: Row("bob") :: 
Row("sam") :: Nil)
+  }
 }
 
 case class GroupByKey(a: Int, b: Int)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch master updated: [SPARK-34776][SQL] Nested column pruning should not prune Window produced attributes

Reply via email to