cloud-fan commented on a change in pull request #24442: [SPARK-27547][SQL] fix 
DataFrame self-join problems
URL: https://github.com/apache/spark/pull/24442#discussion_r279731583
 
 

 ##########
 File path: 
sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
 ##########
 @@ -117,6 +117,57 @@ class DataFrameJoinSuite extends QueryTest with 
SharedSQLContext {
         .collect().toSeq)
   }
 
+  test("join - self join without manual alias") {
+    val df1 = spark.range(3)
+    val df2 = df1.filter($"id" > 0)
+
+    withSQLConf(
+      SQLConf.RESOLVE_DATASET_COLUMN_REFERENCE.key -> "false",
+      SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
+      // `df1("id") > df2("id")` is always false.
+      checkAnswer(df1.join(df2, df1("id") > df2("id")), Nil)
+
+      // `df2("id")` actually points to the column of `df1`.
+      checkAnswer(df1.join(df2).select(df2("id")), Seq(0, 0, 1, 1, 2, 
2).map(Row(_)))
+
+      val df3 = df1.filter($"id" <= 2)
+      // `df2("id") < df3("id")` is always false
+      checkAnswer(df1.join(df2).join(df3, df2("id") < df3("id")), Nil)
+    }
+
+    withSQLConf(
+      SQLConf.RESOLVE_DATASET_COLUMN_REFERENCE.key -> "true",
+      SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
+      checkAnswer(df1.join(df2, df1("id") > df2("id")), Row(2, 1))
+
+      checkAnswer(df1.join(df2).select(df2("id")), Seq(1, 2, 1, 2, 1, 
2).map(Row(_)))
+
+      val df3 = df1.filter($"id" <= 2)
+      checkAnswer(
+        df1.join(df2).join(df3, df2("id") < df3("id")),
+        Row(0, 1, 2) :: Row(1, 1, 2) :: Row(2, 1, 2) :: Nil)
+
+      checkAnswer(
+        df3.join(df1.join(df2), df2("id") < df3("id")),
+        Row(2, 0, 1) :: Row(2, 1, 1) :: Row(2, 2, 1) :: Nil)
+
+      // `df1("id")` is ambiguous here.
+      // Current behavior is: it points to the column of the first `df1`.
+      checkAnswer(
+        df1.join(df2).join(df1, df1("id") > df2("id")),
+        Row(2, 1, 0) :: Row(2, 1, 1) :: Row(2, 1, 2) :: Nil)
 
 Review comment:
   To be honest I don't really know what's the expected result here. 
`df1("id")` should point to the right side `df1` of the join, and seems it's OK 
if `df2(id)` points to the unioned df. Before this PR this query returns no 
result, which is confusing as well.
   
   If we change `Union.output` and generate new attributes to it, then this 
query throws analysis exception, which seems more reasonable.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to