dilipbiswal commented on a change in pull request #24087:
[SPARK-27096][SQL][FOLLOWUP] Do the correct validation of join types in R side
and fix join docs for scala, python and r
URL: https://github.com/apache/spark/pull/24087#discussion_r265848916
##########
File path: R/pkg/tests/fulltests/test_sparkSQL.R
##########
@@ -2356,40 +2356,96 @@ test_that("join(), crossJoin() and merge() on a
DataFrame", {
expect_equal(names(joined2), c("age", "name", "name", "test"))
expect_equal(count(joined2), 3)
- joined3 <- join(df, df2, df$name == df2$name, "rightouter")
+ joined3 <- join(df, df2, df$name == df2$name, "right")
expect_equal(names(joined3), c("age", "name", "name", "test"))
expect_equal(count(joined3), 4)
expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
-
- joined4 <- select(join(df, df2, df$name == df2$name, "outer"),
- alias(df$age + 5, "newAge"), df$name, df2$test)
- expect_equal(names(joined4), c("newAge", "name", "test"))
+
+ joined4 <- join(df, df2, df$name == df2$name, "right_outer")
+ expect_equal(names(joined4), c("age", "name", "name", "test"))
expect_equal(count(joined4), 4)
- expect_equal(collect(orderBy(joined4, joined4$name))$newAge[3], 24)
+ expect_true(is.na(collect(orderBy(joined4, joined4$age))$age[2]))
- joined5 <- join(df, df2, df$name == df2$name, "leftouter")
+ joined5 <- join(df, df2, df$name == df2$name, "rightouter")
expect_equal(names(joined5), c("age", "name", "name", "test"))
- expect_equal(count(joined5), 3)
- expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[1]))
-
- joined6 <- join(df, df2, df$name == df2$name, "inner")
- expect_equal(names(joined6), c("age", "name", "name", "test"))
- expect_equal(count(joined6), 3)
+ expect_equal(count(joined5), 4)
+ expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[2]))
- joined7 <- join(df, df2, df$name == df2$name, "leftsemi")
- expect_equal(names(joined7), c("age", "name"))
- expect_equal(count(joined7), 3)
- joined8 <- join(df, df2, df$name == df2$name, "left_outer")
- expect_equal(names(joined8), c("age", "name", "name", "test"))
- expect_equal(count(joined8), 3)
- expect_true(is.na(collect(orderBy(joined8, joined8$age))$age[1]))
-
- joined9 <- join(df, df2, df$name == df2$name, "right_outer")
- expect_equal(names(joined9), c("age", "name", "name", "test"))
+ joined6 <- select(join(df, df2, df$name == df2$name, "outer"),
+ alias(df$age + 5, "newAge"), df$name, df2$test)
+ expect_equal(names(joined6), c("newAge", "name", "test"))
+ expect_equal(count(joined6), 4)
+ expect_equal(collect(orderBy(joined6, joined6$name))$newAge[3], 24)
+
+ joined7 <- select(join(df, df2, df$name == df2$name, "full"),
+ alias(df$age + 5, "newAge"), df$name, df2$test)
+ expect_equal(names(joined7), c("newAge", "name", "test"))
+ expect_equal(count(joined7), 4)
+ expect_equal(collect(orderBy(joined7, joined7$name))$newAge[3], 24)
+
+ joined8 <- select(join(df, df2, df$name == df2$name, "fullouter"),
+ alias(df$age + 5, "newAge"), df$name, df2$test)
+ expect_equal(names(joined8), c("newAge", "name", "test"))
+ expect_equal(count(joined8), 4)
+ expect_equal(collect(orderBy(joined8, joined8$name))$newAge[3], 24)
+
+ joined9 <- select(join(df, df2, df$name == df2$name, "full_outer"),
+ alias(df$age + 5, "newAge"), df$name, df2$test)
+ expect_equal(names(joined9), c("newAge", "name", "test"))
expect_equal(count(joined9), 4)
- expect_true(is.na(collect(orderBy(joined9, joined9$age))$age[2]))
-
+ expect_equal(collect(orderBy(joined9, joined9$name))$newAge[3], 24)
+
+ joined10 <- join(df, df2, df$name == df2$name, "left")
+ expect_equal(names(joined10), c("age", "name", "name", "test"))
+ expect_equal(count(joined10), 3)
+ expect_true(is.na(collect(orderBy(joined10, joined10$age))$age[1]))
+
+ joined11 <- join(df, df2, df$name == df2$name, "leftouter")
+ expect_equal(names(joined11), c("age", "name", "name", "test"))
+ expect_equal(count(joined11), 3)
+ expect_true(is.na(collect(orderBy(joined11, joined11$age))$age[1]))
+
+ joined12 <- join(df, df2, df$name == df2$name, "left_outer")
+ expect_equal(names(joined12), c("age", "name", "name", "test"))
+ expect_equal(count(joined12), 3)
+ expect_true(is.na(collect(orderBy(joined12, joined12$age))$age[1]))
+
+ joined13 <- join(df, df2, df$name == df2$name, "inner")
+ expect_equal(names(joined13), c("age", "name", "name", "test"))
+ expect_equal(count(joined13), 3)
+
+ joined14 <- join(df, df2, df$name == df2$name, "semi")
+ expect_equal(names(joined14), c("age", "name"))
+ expect_equal(count(joined14), 3)
+
+ joined14 <- join(df, df2, df$name == df2$name, "leftsemi")
+ expect_equal(names(joined14), c("age", "name"))
+ expect_equal(count(joined14), 3)
+
+ joined15 <- join(df, df2, df$name == df2$name, "left_semi")
+ expect_equal(names(joined15), c("age", "name"))
+ expect_equal(count(joined15), 3)
+
+ joined16 <- join(df2, df, df2$name == df$name, "anti")
+ expect_equal(names(joined16), c("name", "test"))
+ expect_equal(count(joined16), 1)
+
+ joined17 <- join(df2, df, df2$name == df$name, "leftanti")
+ expect_equal(names(joined17), c("name", "test"))
+ expect_equal(count(joined17), 1)
+
+ joined18 <- join(df2, df, df2$name == df$name, "left_anti")
+ expect_equal(names(joined18), c("name", "test"))
+ expect_equal(count(joined18), 1)
Review comment:
@felixcheung The only reason I added one minimal test for each join type is
because of the validation of join types that is done on the R side. If we
wrongly gate a valid join type in that code, it would be caught here in this
test. Do you want me to remove some of them?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]