Github user felixcheung commented on a diff in the pull request: https://github.com/apache/spark/pull/19816#discussion_r153021587 --- Diff: R/pkg/tests/fulltests/test_sparkSQL.R --- @@ -3021,41 +3021,54 @@ test_that("dapplyCollect() on DataFrame with a binary column", { }) test_that("repartition by columns on DataFrame", { - df <- createDataFrame( - list(list(1L, 1, "1", 0.1), list(1L, 2, "2", 0.2), list(3L, 3, "3", 0.3)), - c("a", "b", "c", "d")) - - # no column and number of partitions specified - retError <- tryCatch(repartition(df), error = function(e) e) - expect_equal(grepl - ("Please, specify the number of partitions and/or a column\\(s\\)", retError), TRUE) - - # repartition by column and number of partitions - actual <- repartition(df, 3, col = df$"a") - - # Checking that at least the dimensions are identical - expect_identical(dim(df), dim(actual)) - expect_equal(getNumPartitions(actual), 3L) - - # repartition by number of partitions - actual <- repartition(df, 13L) - expect_identical(dim(df), dim(actual)) - expect_equal(getNumPartitions(actual), 13L) - - expect_equal(getNumPartitions(coalesce(actual, 1L)), 1L) - - # a test case with a column and dapply - schema <- structType(structField("a", "integer"), structField("avg", "double")) - df <- repartition(df, col = df$"a") - df1 <- dapply( - df, - function(x) { - y <- (data.frame(x$a[1], mean(x$b))) - }, - schema) + # The tasks here launch R workers with shuffles. So, we decrease the number of shuffle + # partitions to reduce the number of the tasks to speed up the test. This is particularly + # slow on Windows because the R workers are unable to be forked. See also SPARK-21693. + conf <- callJMethod(sparkSession, "conf") + value <- callJMethod(conf, "get", "spark.sql.shuffle.partitions") --- End diff -- call this shufflepartitionsvalue?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org