This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7a8a600 [SPARK-34776][SQL] Nested column pruning should not prune
Window produced attributes
7a8a600 is described below
commit 7a8a600995ddee32f0a9c81a97be3fc2bca21928
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Fri Mar 19 11:44:02 2021 -0700
[SPARK-34776][SQL] Nested column pruning should not prune Window produced
attributes
### What changes were proposed in this pull request?
This patch proposes to fix a bug related to `NestedColumnAliasing`. The
root cause is that `Window` doesn't override `producedAttributes`, so the
`NestedColumnAliasing` rule wrongly prunes attributes produced by `Window`.
Both master and branch-3.1 have this issue.
### Why are the changes needed?
This change is needed to fix a bug in nested column pruning.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Added a unit test to `DataFrameSuite`.
Closes #31897 from viirya/SPARK-34776.
Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../plans/logical/basicLogicalOperators.scala | 2 ++
.../scala/org/apache/spark/sql/DataFrameSuite.scala | 19 +++++++++++++++++++
2 files changed, 21 insertions(+)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index 99df8c4..0946773 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -670,6 +670,8 @@ case class Window(
override def output: Seq[Attribute] =
child.output ++ windowExpressions.map(_.toAttribute)
+ override def producedAttributes: AttributeSet = windowOutputSet
+
def windowOutputSet: AttributeSet =
AttributeSet(windowExpressions.map(_.toAttribute))
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index d754292..6e279f3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -41,6 +41,7 @@ import org.apache.spark.sql.execution.{FilterExec,
QueryExecution, WholeStageCod
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.execution.aggregate.HashAggregateExec
import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec,
ReusedExchangeExec, ShuffleExchangeExec}
+import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT,
SharedSparkSession}
@@ -2736,6 +2737,24 @@ class DataFrameSuite extends QueryTest
val col2 = df.colRegex("test\n_table.`tes.*\n.*mn`")
checkAnswer(df.select(col2), Row(1) :: Row(2) :: Row(3) :: Nil)
}
+
+ test("SPARK-34776: Nested column pruning should not prune Window produced
attributes") {
+ val df = Seq(
+ ("t1", "123", "bob"),
+ ("t1", "456", "bob"),
+ ("t2", "123", "sam")
+ ).toDF("type", "value", "name")
+
+ val test = df.select(
+ $"*",
+ struct(count($"*").over(Window.partitionBy($"type", $"value", $"name"))
+ .as("count"), $"name").as("name_count")
+ ).select(
+ $"*",
+ max($"name_count").over(Window.partitionBy($"type",
$"value")).as("best_name")
+ )
+ checkAnswer(test.select($"best_name.name"), Row("bob") :: Row("bob") ::
Row("sam") :: Nil)
+ }
}
case class GroupByKey(a: Int, b: Int)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]