This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new d5b9fc119cc [SPARK-41178][SQL] Fix parser rule precedence between JOIN and comma d5b9fc119cc is described below commit d5b9fc119cc4eedff35134ecb64b3123b70a689d Author: Wenchen Fan <wenc...@databricks.com> AuthorDate: Fri Nov 18 11:26:19 2022 +0800 [SPARK-41178][SQL] Fix parser rule precedence between JOIN and comma ### What changes were proposed in this pull request? This PR fixes a long-standing parser bug in Spark: `JOIN` should take precedence over comma when combining relations. For example, `FROM t1, t2 JOIN t3` should result in `t1 X (t2 X t3)`. However, it's `(t1 X t2) X t3` today. You can easily verify this behavior in other databases by running the query `SELECT * FROM t1, t2 JOIN t3 ON t1.c and t3.c`. It should fail as `t1.c` is not available when joining t2 and t3. I tested MySQL, Oracle and PostgreSQL; all of them fail, but Spark can run this query due to the wrong join order. However, this bug has a large impact: it changes the join order, which can lead to worse performance or unexpected analysis errors. To be safe, this PR hides the fix under a new ANSI sub-config. ### Why are the changes needed? bug fix ### Does this PR introduce _any_ user-facing change? No, as the fix is turned off by default. ### How was this patch tested? new tests Closes #38691 from cloud-fan/join. 
Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/parser/AstBuilder.scala | 8 +++++-- .../org/apache/spark/sql/internal/SQLConf.scala | 14 ++++++++++-- .../sql/catalyst/parser/PlanParserSuite.scala | 26 ++++++++++++++++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index af2097b5d0f..4adb70bc390 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -914,7 +914,11 @@ class AstBuilder extends SqlBaseParserBaseVisitor[AnyRef] with SQLConfHelper wit */ override def visitFromClause(ctx: FromClauseContext): LogicalPlan = withOrigin(ctx) { val from = ctx.relation.asScala.foldLeft(null: LogicalPlan) { (left, relation) => - val right = plan(relation.relationPrimary) + val right = if (conf.ansiRelationPrecedence) { + visitRelation(relation) + } else { + plan(relation.relationPrimary) + } val join = right.optionalMap(left) { (left, right) => if (relation.LATERAL != null) { if (!relation.relationPrimary.isInstanceOf[AliasedQueryContext]) { @@ -925,7 +929,7 @@ class AstBuilder extends SqlBaseParserBaseVisitor[AnyRef] with SQLConfHelper wit Join(left, right, Inner, None, JoinHint.NONE) } } - withJoinRelations(join, relation) + if (conf.ansiRelationPrecedence) join else withJoinRelations(join, relation) } if (ctx.pivotClause() != null) { if (ctx.unpivotClause() != null) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index f6c3f902409..2f4b03cf591 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2984,8 +2984,16 @@ object SQLConf { .createWithDefault(false) val DOUBLE_QUOTED_IDENTIFIERS = buildConf("spark.sql.ansi.doubleQuotedIdentifiers") - .doc("When true, Spark SQL reads literals enclosed in double quoted (\") as identifiers. " + - "When false they are read as string literals.") + .doc(s"When true and '${ANSI_ENABLED.key}' is true, Spark SQL reads literals enclosed in " + + "double quoted (\") as identifiers. When false they are read as string literals.") + .version("3.4.0") + .booleanConf + .createWithDefault(false) + + val ANSI_RELATION_PRECEDENCE = buildConf("spark.sql.ansi.relationPrecedence") + .doc(s"When true and '${ANSI_ENABLED.key}' is true, JOIN takes precedence over comma when " + + "combining relation. For example, `t1, t2 JOIN t3` should result to `t1 X (t2 X t3)`. If " + + "the config is false, the result is `(t1 X t2) X t3`.") .version("3.4.0") .booleanConf .createWithDefault(false) @@ -4684,6 +4692,8 @@ class SQLConf extends Serializable with Logging { def doubleQuotedIdentifiers: Boolean = ansiEnabled && getConf(DOUBLE_QUOTED_IDENTIFIERS) + def ansiRelationPrecedence: Boolean = ansiEnabled && getConf(ANSI_RELATION_PRECEDENCE) + def timestampType: AtomicType = getConf(TIMESTAMP_TYPE) match { case "TIMESTAMP_LTZ" => // For historical reason, the TimestampType maps to TIMESTAMP WITH LOCAL TIME ZONE diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 968e2227234..11590e465c2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -704,6 +704,32 @@ class PlanParserSuite extends AnalysisTest { .join(table("t2"), Inner, Option($"t1.col1" === $"t2.col2")) .select(star())) + assertEqual( + 
"select * from t1 JOIN t2, t3 join t2 on t1.col1 = t2.col2", + table("t1") + .join(table("t2")) + .join(table("t3")) + .join(table("t2"), Inner, Option($"t1.col1" === $"t2.col2")) + .select(star())) + + // Implicit joins - ANSI mode + withSQLConf( + SQLConf.ANSI_ENABLED.key -> "true", + SQLConf.ANSI_RELATION_PRECEDENCE.key -> "true") { + + assertEqual( + "select * from t1, t3 join t2 on t1.col1 = t2.col2", + table("t1").join( + table("t3").join(table("t2"), Inner, Option($"t1.col1" === $"t2.col2"))) + .select(star())) + + assertEqual( + "select * from t1 JOIN t2, t3 join t2 on t1.col1 = t2.col2", + table("t1").join(table("t2")).join( + table("t3").join(table("t2"), Inner, Option($"t1.col1" === $"t2.col2"))) + .select(star())) + } + // Test lateral join with join conditions assertEqual( s"select * from t join lateral (select * from u) uu on true", --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org