ahshahid commented on code in PR #45343:
URL: https://github.com/apache/spark/pull/45343#discussion_r1520395436
##########
sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala:
##########
@@ -498,4 +559,70 @@ class DataFrameSelfJoinSuite extends QueryTest with
SharedSparkSession {
assert(df1.join(df2, $"t1.i" === $"t2.i").cache().count() == 1)
}
}
+
+ test("SPARK_47217: deduplication of project causes ambiguity in resolution")
{
+ val df = Seq((1, 2)).toDF("a", "b")
+ val df2 = df.select(df("a").as("aa"), df("b").as("bb"))
+ val df3 = df2.join(df, df2("bb") === df("b")).select(df2("aa"), df("a"))
+ checkAnswer(
+ df3,
+ Row(1, 1) :: Nil)
+ }
+
+ test("SPARK-47217. deduplication in nested joins with join attribute
aliased") {
+ val df1 = Seq((1, 2)).toDF("a", "b")
+ val df2 = Seq((1, 2)).toDF("aa", "bb")
+ val df1Joindf2 = df1.join(df2, df1("a") ===
df2("aa")).select(df1("a").as("aaa"),
+ df2("aa"), df1("b"))
+
+ assertCorrectResolution(df1Joindf2.join(df1, df1Joindf2("aaa") ===
df1("a")),
+ Resolution.LeftConditionToLeftLeg, Resolution.RightConditionToRightLeg)
+
+ assertCorrectResolution(df1.join(df1Joindf2, df1Joindf2("aaa") ===
df1("a")),
+ Resolution.LeftConditionToRightLeg, Resolution.RightConditionToLeftLeg)
+
+ val proj1 = df1Joindf2.join(df1, df1Joindf2("aaa") ===
df1("a")).select(df1Joindf2("aa"),
+ df1("a")).queryExecution.analyzed.asInstanceOf[Project]
+ val join1 = proj1.child.asInstanceOf[Join]
+ assert(proj1.projectList(0).references.subsetOf(join1.left.outputSet))
+ assert(proj1.projectList(1).references.subsetOf(join1.right.outputSet))
+
+ val proj2 = df1.join(df1Joindf2, df1Joindf2("aaa") ===
df1("a")).select(df1Joindf2("aa"),
+ df1("a")).queryExecution.analyzed.asInstanceOf[Project]
+ val join2 = proj2.child.asInstanceOf[Join]
+ assert(proj2.projectList(0).references.subsetOf(join2.right.outputSet))
+ assert(proj2.projectList(1).references.subsetOf(join2.left.outputSet))
+ }
+
+ test("SPARK-47217. deduplication in nested joins without join attribute
aliased") {
+ val df1 = Seq((1, 2)).toDF("a", "b")
+ val df2 = Seq((1, 2)).toDF("aa", "bb")
+ val df1Joindf2 = df1.join(df2, df1("a") === df2("aa")).select(df1("a"),
df2("aa"), df1("b"))
+
+ assertCorrectResolution(df1Joindf2.join(df1, df1Joindf2("a") === df1("a")),
+ Resolution.LeftConditionToLeftLeg, Resolution.RightConditionToRightLeg)
+
+ assertCorrectResolution(df1.join(df1Joindf2, df1Joindf2("a") === df1("a")),
+ Resolution.LeftConditionToRightLeg, Resolution.RightConditionToLeftLeg)
+
+ val proj1 = df1Joindf2.join(df1, df1Joindf2("a") ===
df1("a")).select(df1Joindf2("a"),
+ df1("a")).queryExecution.analyzed.asInstanceOf[Project]
Review Comment:
@peter-toth Another source of confusion is that the join condition is
resolved against the un-deduplicated plan. Once the join plan is
de-duplicated and the join condition is then resolved, the problem
changes from an ambiguous resolution to, I suppose, both the LHS and RHS
resolving to the same leg, which is then handled by resolving via the tag Id.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]