This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push: new b23198ee6d7 [SPARK-41538][SQL] Metadata column should be appended at the end of project list b23198ee6d7 is described below commit b23198ee6d76cc0486ae810a1d37f0474b74c27c Author: Gengliang Wang <gengli...@apache.org> AuthorDate: Fri Dec 16 10:43:17 2022 +0300 [SPARK-41538][SQL] Metadata column should be appended at the end of project list ### What changes were proposed in this pull request? For the following query: ``` CREATE TABLE table_1 ( a ARRAY<STRING>, s STRUCT<id: STRING>) USING parquet; CREATE VIEW view_1 (id) AS WITH source AS ( SELECT * FROM table_1 ), renamed AS ( SELECT s.id FROM source ) SELECT id FROM renamed; with foo AS ( SELECT 'a' as id ), bar AS ( SELECT 'a' as id ) SELECT 1 FROM foo FULL OUTER JOIN bar USING(id) FULL OUTER JOIN view_1 USING(id) WHERE foo.id IS NOT NULL ``` There will be the following error: ``` class org.apache.spark.sql.types.ArrayType cannot be cast to class org.apache.spark.sql.types.StructType (org.apache.spark.sql.types.ArrayType and org.apache.spark.sql.types.StructType are in unnamed module of loader 'app') java.lang.ClassCastException: class org.apache.spark.sql.types.ArrayType cannot be cast to class org.apache.spark.sql.types.StructType (org.apache.spark.sql.types.ArrayType and org.apache.spark.sql.types.StructType are in unnamed module of loader 'app') at org.apache.spark.sql.catalyst.expressions.GetStructField.childSchema$lzycompute(complexTypeExtractors.scala:108) at org.apache.spark.sql.catalyst.expressions.GetStructField.childSchema(complexTypeExtractors.scala:108) ``` This is caused by the inconsistent metadata column positions in the following two nodes: * Table relation: at the ending position * Project list: at the beginning position <img width="1442" alt="image" src="https://user-images.githubusercontent.com/1097932/207992343-438714bc-e1d1-46f7-9a79-84ab83dd299f.png"> When the InlineCTE rule executes, the metadata column in the 
project is wrongly combined with the table output. <img width="1438" alt="image" src="https://user-images.githubusercontent.com/1097932/207992431-f4cfc774-4cab-4728-b109-2ebff94e5fe2.png"> Thus the column `a ARRAY<STRING>` is cast to `s STRUCT<id: STRING>`, which causes the error. This PR fixes the issue by putting the metadata column at the end of the project list, so that it is consistent with the table relation. ### Why are the changes needed? Bug fix ### Does this PR introduce _any_ user-facing change? Yes, it fixes a bug in the analysis rule `AddMetadataColumns` ### How was this patch tested? New test case Closes #39081 from gengliangwang/fixMetadata. Authored-by: Gengliang Wang <gengli...@apache.org> Signed-off-by: Max Gekk <max.g...@gmail.com> (cherry picked from commit 172f719fffa84a2528628e08627a02cf8d1fe8a8) Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../spark/sql/catalyst/analysis/Analyzer.scala | 2 +- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 39 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 0c68dd8839d..c6429077b07 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -970,7 +970,7 @@ class Analyzer(override val catalogManager: CatalogManager) case s: ExposesMetadataColumns => s.withMetadataColumns() case p: Project => val newProj = p.copy( - projectList = p.metadataOutput ++ p.projectList, + projectList = p.projectList ++ p.metadataOutput, child = addMetadataCol(p.child)) newProj.copyTagsFrom(p) newProj diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5b42d05c237..66f9700e8ac 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -4572,6 +4572,45 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark sql("SELECT /*+ hash(t2) */ * FROM t1 join t2 on c1 = c2") } } + + test("SPARK-41538: Metadata column should be appended at the end of project") { + val tableName = "table_1" + val viewName = "view_1" + withTable(tableName) { + withView(viewName) { + sql(s"CREATE TABLE $tableName (a ARRAY<STRING>, s STRUCT<id: STRING>) USING parquet") + val id = "id1" + sql(s"INSERT INTO $tableName values(ARRAY('a'), named_struct('id', '$id'))") + sql( + s""" + |CREATE VIEW $viewName (id) + |AS WITH source AS ( + | SELECT * FROM $tableName + |), + |renamed AS ( + | SELECT s.id FROM source + |) + |SELECT id FROM renamed + |""".stripMargin) + val query = + s""" + |with foo AS ( + | SELECT '$id' as id + |), + |bar AS ( + | SELECT '$id' as id + |) + |SELECT + | 1 + |FROM foo + |FULL OUTER JOIN bar USING(id) + |FULL OUTER JOIN $viewName USING(id) + |WHERE foo.id IS NOT NULL + |""".stripMargin + checkAnswer(sql(query), Row(1)) + } + } + } } case class Foo(bar: Option[String]) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org