Repository: spark Updated Branches: refs/heads/master 5cfabec87 -> 961342489
[SPARK-16059][R] Add `monotonically_increasing_id` function in SparkR ## What changes were proposed in this pull request? This PR adds `monotonically_increasing_id` column function in SparkR for API parity. After this PR, SparkR supports the followings. ```r > df <- read.json("examples/src/main/resources/people.json") > collect(select(df, monotonically_increasing_id(), df$name, df$age)) monotonically_increasing_id() name age 1 0 Michael NA 2 1 Andy 30 3 2 Justin 19 ``` ## How was this patch tested? Pass the Jenkins tests (with added testcase). Author: Dongjoon Hyun <dongj...@apache.org> Closes #13774 from dongjoon-hyun/SPARK-16059. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/96134248 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/96134248 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/96134248 Branch: refs/heads/master Commit: 9613424898fd2a586156bc4eb48e255749774f20 Parents: 5cfabec Author: Dongjoon Hyun <dongj...@apache.org> Authored: Mon Jun 20 11:12:41 2016 -0700 Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu> Committed: Mon Jun 20 11:12:41 2016 -0700 ---------------------------------------------------------------------- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 27 ++++++++++++++++++++++++++ R/pkg/R/generics.R | 5 +++++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- 4 files changed, 34 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/96134248/R/pkg/NAMESPACE ---------------------------------------------------------------------- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 82e56ca..0cfe190 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -218,6 +218,7 @@ exportMethods("%in%", "mean", "min", "minute", + "monotonically_increasing_id", "month", "months_between", "n", http://git-wip-us.apache.org/repos/asf/spark/blob/96134248/R/pkg/R/functions.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a779127..0fb38bc 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -911,6 +911,33 @@ setMethod("minute", column(jc) }) +#' monotonically_increasing_id +#' +#' Return a column that generates monotonically increasing 64-bit integers. +#' +#' The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. +#' The current implementation puts the partition ID in the upper 31 bits, and the record number +#' within each partition in the lower 33 bits. The assumption is that the SparkDataFrame has +#' less than 1 billion partitions, and each partition has less than 8 billion records. +#' +#' As an example, consider a SparkDataFrame with two partitions, each with 3 records. +#' This expression would return the following IDs: +#' 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. +#' +#' This is equivalent to the MONOTONICALLY_INCREASING_ID function in SQL. +#' +#' @rdname monotonically_increasing_id +#' @name monotonically_increasing_id +#' @family misc_funcs +#' @export +#' @examples \dontrun{select(df, monotonically_increasing_id())} +setMethod("monotonically_increasing_id", + signature(x = "missing"), + function() { + jc <- callJStatic("org.apache.spark.sql.functions", "monotonically_increasing_id") + column(jc) + }) + #' month #' #' Extracts the month as an integer from a given date/timestamp/string. http://git-wip-us.apache.org/repos/asf/spark/blob/96134248/R/pkg/R/generics.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 6e754af..37d0556 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -993,6 +993,11 @@ setGeneric("md5", function(x) { standardGeneric("md5") }) #' @export setGeneric("minute", function(x) { standardGeneric("minute") }) +#' @rdname monotonically_increasing_id +#' @export +setGeneric("monotonically_increasing_id", + function(x) { standardGeneric("monotonically_increasing_id") }) + #' @rdname month #' @export setGeneric("month", function(x) { standardGeneric("month") }) http://git-wip-us.apache.org/repos/asf/spark/blob/96134248/R/pkg/inst/tests/testthat/test_sparkSQL.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index fcc2ab3..c5c5a06 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1047,7 +1047,7 @@ test_that("column functions", { c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c) c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c) c7 <- mean(c) + min(c) + month(c) + negate(c) + quarter(c) - c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id() c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c) c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c) c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org