dongjoon-hyun commented on a change in pull request #24442: [SPARK-27547][SQL] fix DataFrame self-join problems
URL: https://github.com/apache/spark/pull/24442#discussion_r279580410
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
##########
@@ -117,6 +117,57 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext {
.collect().toSeq)
}
+ test("join - self join without manual alias") {
+ val df1 = spark.range(3)
+ val df2 = df1.filter($"id" > 0)
+
+ withSQLConf(
+ SQLConf.RESOLVE_DATASET_COLUMN_REFERENCE.key -> "false",
+ SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
+ // `df1("id") > df2("id")` is always false.
+ checkAnswer(df1.join(df2, df1("id") > df2("id")), Nil)
+
+ // `df2("id")` actually points to the column of `df1`.
+ checkAnswer(df1.join(df2).select(df2("id")), Seq(0, 0, 1, 1, 2, 2).map(Row(_)))
+
+ val df3 = df1.filter($"id" <= 2)
+ // `df2("id") < df3("id")` is always false
+ checkAnswer(df1.join(df2).join(df3, df2("id") < df3("id")), Nil)
+ }
+
+ withSQLConf(
+ SQLConf.RESOLVE_DATASET_COLUMN_REFERENCE.key -> "true",
+ SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
+ checkAnswer(df1.join(df2, df1("id") > df2("id")), Row(2, 1))
+
+ checkAnswer(df1.join(df2).select(df2("id")), Seq(1, 2, 1, 2, 1, 2).map(Row(_)))
+
+ val df3 = df1.filter($"id" <= 2)
+ checkAnswer(
+ df1.join(df2).join(df3, df2("id") < df3("id")),
+ Row(0, 1, 2) :: Row(1, 1, 2) :: Row(2, 1, 2) :: Nil)
+
+ checkAnswer(
+ df3.join(df1.join(df2), df2("id") < df3("id")),
+ Row(2, 0, 1) :: Row(2, 1, 1) :: Row(2, 2, 1) :: Nil)
+
+ // `df1("id")` is ambiguous here.
+ // Current behavior is: it points to the column of the first `df1`.
+ checkAnswer(
+ df1.join(df2).join(df1, df1("id") > df2("id")),
+ Row(2, 1, 0) :: Row(2, 1, 1) :: Row(2, 1, 2) :: Nil)
##########
Review comment:
This is much better than the current behavior.
One side effect is that we also start to distinguish `df2`'s rows in
`UNION`ed dataframes, as in the following example. Should we allow this?
```scala
scala> val df1 = spark.range(3)
scala> val df2 = df1.filter($"id" > 0)
scala> df1.union(df2).join(df1, df1("id") > df2("id")).show
+---+---+
| id| id|
+---+---+
| 0| 1|
| 0| 2|
| 1| 2|
| 1| 2|
+---+---+
```
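For comparison, the ambiguity can always be avoided by attaching explicit aliases and resolving columns through the alias qualifier. A minimal sketch, assuming an active `SparkSession` named `spark` (the alias names `l` and `r` are only illustrative):
```scala
import org.apache.spark.sql.functions.col

val df1 = spark.range(3)
val df2 = df1.filter(col("id") > 0)

// Give each side of the self-join an explicit alias and reference columns
// through the alias qualifier, which stays unambiguous regardless of the
// new resolution flag.
val l = df1.alias("l")
val r = df2.alias("r")
l.join(r, col("l.id") > col("r.id")).show()
// Should print a single row (2, 1), matching the expected answer in the test above.
```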