Github user felixcheung commented on a diff in the pull request:
https://github.com/apache/spark/pull/19816#discussion_r153021587
--- Diff: R/pkg/tests/fulltests/test_sparkSQL.R ---
@@ -3021,41 +3021,54 @@ test_that("dapplyCollect() on DataFrame with a binary column", {
})
test_that("repartition by columns on DataFrame", {
- df <- createDataFrame(
- list(list(1L, 1, "1", 0.1), list(1L, 2, "2", 0.2), list(3L, 3, "3", 0.3)),
- c("a", "b", "c", "d"))
-
- # no column and number of partitions specified
- retError <- tryCatch(repartition(df), error = function(e) e)
- expect_equal(grepl("Please, specify the number of partitions and/or a column\\(s\\)", retError), TRUE)
-
- # repartition by column and number of partitions
- actual <- repartition(df, 3, col = df$"a")
-
- # Checking that at least the dimensions are identical
- expect_identical(dim(df), dim(actual))
- expect_equal(getNumPartitions(actual), 3L)
-
- # repartition by number of partitions
- actual <- repartition(df, 13L)
- expect_identical(dim(df), dim(actual))
- expect_equal(getNumPartitions(actual), 13L)
-
- expect_equal(getNumPartitions(coalesce(actual, 1L)), 1L)
-
- # a test case with a column and dapply
- schema <- structType(structField("a", "integer"), structField("avg", "double"))
- df <- repartition(df, col = df$"a")
- df1 <- dapply(
- df,
- function(x) {
- y <- (data.frame(x$a[1], mean(x$b)))
- },
- schema)
+ # The tasks here launch R workers with shuffles. So, we decrease the number of shuffle
+ # partitions to reduce the number of the tasks to speed up the test. This is particularly
+ # slow on Windows because the R workers are unable to be forked. See also SPARK-21693.
+ conf <- callJMethod(sparkSession, "conf")
+ value <- callJMethod(conf, "get", "spark.sql.shuffle.partitions")
--- End diff --
Could we rename `value` to something more descriptive, such as `shufflePartitionsValue`?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]