This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new b309db092cbb [SPARK-50682][SQL] Inner Alias should be canonicalized
b309db092cbb is described below
commit b309db092cbb67fedb7cec9ab0ebcd07d8d95b04
Author: Wenchen Fan <[email protected]>
AuthorDate: Fri Dec 27 15:08:29 2024 +0900
[SPARK-50682][SQL] Inner Alias should be canonicalized
### What changes were proposed in this pull request?
This PR fixes plan canonicalization to normalize the expr ID of inner
Aliases to 0. We should not assume that all inner Aliases will be removed by
the analyzer, as Alias with metadata won't be removed.
### Why are the changes needed?
To avoid cache misses that should not happen.
### Does this PR introduce _any_ user-facing change?
no, table cache is a perf feature.
### How was this patch tested?
a new test
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49309 from cloud-fan/alias.
Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../org/apache/spark/sql/catalyst/plans/QueryPlan.scala | 6 ++++++
.../scala/org/apache/spark/sql/DatasetCacheSuite.scala | 16 ++++++++++++++++
2 files changed, 22 insertions(+)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index 23813d94c549..40244595da57 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -724,6 +724,12 @@ object QueryPlan extends PredicateHelper {
} else {
ar.withExprId(ExprId(ordinal))
}
+
+      // Top-level Alias is already handled by `QueryPlan#doCanonicalize`. For inner Alias, the id
+      // doesn't matter and we normalize it to 0 here.
+      case a: Alias =>
+        Alias(a.child, a.name)(
+          ExprId(0), a.qualifier, a.explicitMetadata, a.nonInheritableMetadataKeys)
}.canonicalized.asInstanceOf[T]
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala
index bda8c7f26082..9d8aaf8d90e3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableS
import org.apache.spark.sql.functions._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types.Metadata
import org.apache.spark.storage.StorageLevel
import org.apache.spark.tags.SlowSQLTest
@@ -312,4 +313,19 @@ class DatasetCacheSuite extends QueryTest
}
}
}
+
+ test("SPARK-50682: inner Alias should be canonicalized") {
+ // Put a metadata in the Alias so that it won't be removed by the analyzer.
+ val metadata = Metadata.fromJson("""{"k": "v"}""")
+ val df1 = spark.range(5).select(struct($"id".as("name", metadata)))
+ df1.cache()
+ // This is exactly the same as df1.
+ val df2 = spark.range(5).select(struct($"id".as("name", metadata)))
+    assert(df2.queryExecution.executedPlan.exists(_.isInstanceOf[InMemoryTableScanExec]))
+
+ val metadata2 = Metadata.fromJson("""{"k2": "v2"}""")
+ // Same with df1 except for the Alias metadata
+ val df3 = spark.range(5).select(struct($"id".as("name", metadata2)))
+    assert(!df3.queryExecution.executedPlan.exists(_.isInstanceOf[InMemoryTableScanExec]))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]