Github user felixcheung commented on a diff in the pull request:
https://github.com/apache/spark/pull/19816#discussion_r153021622
--- Diff: R/pkg/tests/fulltests/test_sparkSQL.R ---
@@ -3078,101 +3091,117 @@ test_that("coalesce, repartition, numPartitions",
{
})
test_that("gapply() and gapplyCollect() on a DataFrame", {
- df <- createDataFrame(
- list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
- c("a", "b", "c", "d"))
- expected <- collect(df)
- df1 <- gapply(df, "a", function(key, x) { x }, schema(df))
- actual <- collect(df1)
- expect_identical(actual, expected)
-
- df1Collect <- gapplyCollect(df, list("a"), function(key, x) { x })
- expect_identical(df1Collect, expected)
-
- # gapply on empty grouping columns.
- df1 <- gapply(df, c(), function(key, x) { x }, schema(df))
- actual <- collect(df1)
- expect_identical(actual, expected)
-
- # Computes the sum of second column by grouping on the first and third columns
- # and checks if the sum is larger than 2
- schemas <- list(structType(structField("a", "integer"), structField("e", "boolean")),
- "a INT, e BOOLEAN")
- for (schema in schemas) {
- df2 <- gapply(
+ # The tasks here launch R workers with shuffles. So, we decrease the number of shuffle
--- End diff --
yes, sounds like we should
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]