Repository: spark Updated Branches: refs/heads/master 60e9b2bdd -> 9c8109ef4
[SPARK-21555][SQL] RuntimeReplaceable should be compared semantically by its canonicalized child ## What changes were proposed in this pull request? When there are aliases (these aliases were added for nested fields) as parameters in `RuntimeReplaceable`, as they are not in the children expression, those aliases can't be cleaned up in analyzer rule `CleanupAliases`. An expression `nvl(foo.foo1, "value")` can be resolved to two semantically different expressions in a group by query because they contain different aliases. Because those aliases are not children of `RuntimeReplaceable` which is an `UnaryExpression`. So we can't trim the aliases out by simple transforming the expressions in `CleanupAliases`. If we want to replace the non-children aliases in `RuntimeReplaceable`, we need to add more codes to `RuntimeReplaceable` and modify all expressions of `RuntimeReplaceable`. It makes the interface ugly IMO. Consider those aliases will be replaced later at optimization and so they're no harm, this patch chooses to simply override `canonicalized` of `RuntimeReplaceable`. One concern is about `CleanupAliases`. Because it actually cannot clean up ALL aliases inside a plan. To make caller of this rule notice that, this patch adds a comment to `CleanupAliases`. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh <[email protected]> Closes #18761 from viirya/SPARK-21555. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c8109ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c8109ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c8109ef Branch: refs/heads/master Commit: 9c8109ef414c92553335bb1e90e9681e142128a4 Parents: 60e9b2b Author: Liang-Chi Hsieh <[email protected]> Authored: Sat Jul 29 10:02:56 2017 -0700 Committer: gatorsmile <[email protected]> Committed: Sat Jul 29 10:02:56 2017 -0700 ---------------------------------------------------------------------- .../spark/sql/catalyst/analysis/Analyzer.scala | 4 +++- .../sql/catalyst/expressions/Expression.scala | 4 ++++ .../inputs/sql-compatibility-functions.sql | 4 ++++ .../results/sql-compatibility-functions.sql.out | 18 +++++++++++++++++- 4 files changed, 28 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/9c8109ef/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 913d846..f987ed8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -2234,7 +2234,9 @@ object EliminateUnions extends Rule[LogicalPlan] { /** * Cleans up unnecessary Aliases inside the plan. Basically we only need Alias as a top level * expression in Project(project list) or Aggregate(aggregate expressions) or - * Window(window expressions). + * Window(window expressions). Notice that if an expression has other expression parameters which + * are not in its `children`, e.g. `RuntimeReplaceable`, the transformation for Aliases in this + * rule can't work for those parameters. */ object CleanupAliases extends Rule[LogicalPlan] { private def trimAliases(e: Expression): Expression = { http://git-wip-us.apache.org/repos/asf/spark/blob/9c8109ef/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index b847ef7..74c4cdd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -241,6 +241,10 @@ trait RuntimeReplaceable extends UnaryExpression with Unevaluable { override def nullable: Boolean = child.nullable override def foldable: Boolean = child.foldable override def dataType: DataType = child.dataType + // As this expression gets replaced at optimization with its `child" expression, + // two `RuntimeReplaceable` are considered to be semantically equal if their "child" expressions + // are semantically equal. + override lazy val canonicalized: Expression = child.canonicalized } http://git-wip-us.apache.org/repos/asf/spark/blob/9c8109ef/sql/core/src/test/resources/sql-tests/inputs/sql-compatibility-functions.sql ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/sql-tests/inputs/sql-compatibility-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/sql-compatibility-functions.sql index 2b5b692..f146103 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/sql-compatibility-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/sql-compatibility-functions.sql @@ -23,3 +23,7 @@ SELECT float(1), double(1), decimal(1); SELECT date("2014-04-04"), timestamp(date("2014-04-04")); -- error handling: only one argument SELECT string(1, 2); + +-- SPARK-21555: RuntimeReplaceable used in group by +CREATE TEMPORARY VIEW tempView1 AS VALUES (1, NAMED_STRUCT('col1', 'gamma', 'col2', 'delta')) AS T(id, st); +SELECT nvl(st.col1, "value"), count(*) FROM from tempView1 GROUP BY nvl(st.col1, "value"); http://git-wip-us.apache.org/repos/asf/spark/blob/9c8109ef/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out index 732b110..e035505 100644 --- a/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 13 +-- Number of queries: 15 -- !query 0 @@ -122,3 +122,19 @@ struct<> -- !query 12 output org.apache.spark.sql.AnalysisException Function string accepts only one argument; line 1 pos 7 + + +-- !query 13 +CREATE TEMPORARY VIEW tempView1 AS VALUES (1, NAMED_STRUCT('col1', 'gamma', 'col2', 'delta')) AS T(id, st) +-- !query 13 schema +struct<> +-- !query 13 output + + + +-- !query 14 +SELECT nvl(st.col1, "value"), count(*) FROM from tempView1 GROUP BY nvl(st.col1, "value") +-- !query 14 schema +struct<nvl(tempview1.`st`.`col1` AS `col1`, 'value'):string,FROM:bigint> +-- !query 14 output +gamma 1 --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
