Github user felixcheung commented on a diff in the pull request:
https://github.com/apache/spark/pull/20902#discussion_r177317996
--- Diff: R/pkg/R/DataFrame.R ---
@@ -759,6 +759,67 @@ setMethod("repartition",
dataFrame(sdf)
})
+
+#' Repartition by range
+#'
+#' The following options for repartition by range are possible:
+#' \itemize{
+#' \item{1.} {Return a new SparkDataFrame range partitioned by
+#' the given columns into \code{numPartitions}.}
+#' \item{2.} {Return a new SparkDataFrame range partitioned by the given column(s),
+#' using \code{spark.sql.shuffle.partitions} as the number of partitions.}
+#'}
+#'
+#' @param x a SparkDataFrame.
+#' @param numPartitions the number of partitions to use.
+#' @param col the column by which the range partitioning will be performed.
+#' @param ... additional column(s) to be used in the range partitioning.
+#'
+#' @family SparkDataFrame functions
+#' @rdname repartitionByRange
+#' @name repartitionByRange
+#' @aliases repartitionByRange,SparkDataFrame-method
+#' @seealso \link{repartition}, \link{coalesce}
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' newDF <- repartitionByRange(df, col = df$col1, df$col2)
+#' newDF <- repartitionByRange(df, 3L, col = df$col1, df$col2)
+#'}
+#' @note repartitionByRange since 2.4.0
+setMethod("repartitionByRange",
+ signature(x = "SparkDataFrame"),
+ function(x, numPartitions = NULL, col = NULL, ...) {
+ if (!is.null(numPartitions) && !is.null(col)) {
+ # number of partitions and columns both are specified
+ if (is.numeric(numPartitions) && class(col) == "Column") {
+ cols <- list(col, ...)
+ jcol <- lapply(cols, function(c) { c@jc })
+ sdf <- callJMethod(x@sdf, "repartitionByRange",
numToInt(numPartitions), jcol)
+ } else {
+ stop(paste("numPartitions and col must be numeric and
Column; however, got",
+ class(numPartitions), "and", class(col)))
+ }
+ } else if (!is.null(col)) {
+ # only columns are specified
+ if (class(col) == "Column") {
+ cols <- list(col, ...)
+ jcol <- lapply(cols, function(c) { c@jc })
+ sdf <- callJMethod(x@sdf, "repartitionByRange", jcol)
--- End diff --
Cool. There is some duplication here, but I think it is unavoidable for the sake of clarity.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]