spark git commit: [SPARK-12029][SPARKR] Improve column functions signature, param check, tests, fix doc and add examples

shivaram Sat, 28 Nov 2015 21:02:30 -0800

Repository: spark
Updated Branches:
  refs/heads/master 149cd692e -> 28e46ab46



[SPARK-12029][SPARKR] Improve column functions signature, param check, tests, 
fix doc and add examples

shivaram sun-rui

Author: felixcheung <[email protected]>

Closes #10019 from felixcheung/rfunctionsdoc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28e46ab4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28e46ab4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28e46ab4

Branch: refs/heads/master
Commit: 28e46ab46368ea3833c8e805163893bbb6f2a265
Parents: 149cd69
Author: felixcheung <[email protected]>
Authored: Sat Nov 28 21:02:05 2015 -0800
Committer: Shivaram Venkataraman <[email protected]>
Committed: Sat Nov 28 21:02:05 2015 -0800

----------------------------------------------------------------------
 R/pkg/R/functions.R              | 121 +++++++++++++++++++++++++---------
 R/pkg/inst/tests/test_sparkSQL.R |   9 +--
 2 files changed, 96 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/28e46ab4/R/pkg/R/functions.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index e98e7a0..b30331c 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -878,7 +878,7 @@ setMethod("rtrim",
 #'}
 setMethod("sd",
           signature(x = "Column"),
-          function(x, na.rm = FALSE) {
+          function(x) {
             # In R, sample standard deviation is calculated with the sd() 
function.
             stddev_samp(x)
           })
@@ -1250,7 +1250,7 @@ setMethod("upper",
 #'}
 setMethod("var",
           signature(x = "Column"),
-          function(x, y = NULL, na.rm = FALSE, use) {
+          function(x) {
             # In R, sample variance is calculated with the var() function.
             var_samp(x)
           })
@@ -1467,6 +1467,7 @@ setMethod("pmod", signature(y = "Column"),
 #' @name approxCountDistinct
 #' @return the approximate number of distinct items in a group.
 #' @export
+#' @examples \dontrun{approxCountDistinct(df$c, 0.02)}
 setMethod("approxCountDistinct",
           signature(x = "Column"),
           function(x, rsd = 0.05) {
@@ -1481,14 +1482,16 @@ setMethod("approxCountDistinct",
 #' @name countDistinct
 #' @return the number of distinct items in a group.
 #' @export
+#' @examples \dontrun{countDistinct(df$c)}
 setMethod("countDistinct",
           signature(x = "Column"),
           function(x, ...) {
-            jcol <- lapply(list(...), function (x) {
+            jcols <- lapply(list(...), function (x) {
+              stopifnot(class(x) == "Column")
               x@jc
             })
             jc <- callJStatic("org.apache.spark.sql.functions", 
"countDistinct", x@jc,
-                              jcol)
+                              jcols)
             column(jc)
           })
 
@@ -1501,10 +1504,14 @@ setMethod("countDistinct",
 #' @rdname concat
 #' @name concat
 #' @export
+#' @examples \dontrun{concat(df$strings, df$strings2)}
 setMethod("concat",
           signature(x = "Column"),
           function(x, ...) {
-            jcols <- lapply(list(x, ...), function(x) { x@jc })
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
             jc <- callJStatic("org.apache.spark.sql.functions", "concat", 
jcols)
             column(jc)
           })
@@ -1518,11 +1525,15 @@ setMethod("concat",
 #' @rdname greatest
 #' @name greatest
 #' @export
+#' @examples \dontrun{greatest(df$c, df$d)}
 setMethod("greatest",
           signature(x = "Column"),
           function(x, ...) {
             stopifnot(length(list(...)) > 0)
-            jcols <- lapply(list(x, ...), function(x) { x@jc })
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
             jc <- callJStatic("org.apache.spark.sql.functions", "greatest", 
jcols)
             column(jc)
           })
@@ -1530,17 +1541,21 @@ setMethod("greatest",
 #' least
 #'
 #' Returns the least value of the list of column names, skipping null values.
-#' This function takes at least 2 parameters. It will return null iff all 
parameters are null.
+#' This function takes at least 2 parameters. It will return null if all 
parameters are null.
 #'
 #' @family normal_funcs
 #' @rdname least
 #' @name least
 #' @export
+#' @examples \dontrun{least(df$c, df$d)}
 setMethod("least",
           signature(x = "Column"),
           function(x, ...) {
             stopifnot(length(list(...)) > 0)
-            jcols <- lapply(list(x, ...), function(x) { x@jc })
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
             jc <- callJStatic("org.apache.spark.sql.functions", "least", jcols)
             column(jc)
           })
@@ -1549,11 +1564,10 @@ setMethod("least",
 #'
 #' Computes the ceiling of the given value.
 #'
-#' @family math_funcs
 #' @rdname ceil
-#' @name ceil
-#' @aliases ceil
+#' @name ceiling
 #' @export
+#' @examples \dontrun{ceiling(df$c)}
 setMethod("ceiling",
           signature(x = "Column"),
           function(x) {
@@ -1564,11 +1578,10 @@ setMethod("ceiling",
 #'
 #' Computes the signum of the given value.
 #'
-#' @family math_funcs
 #' @rdname signum
-#' @name signum
-#' @aliases signum
+#' @name sign
 #' @export
+#' @examples \dontrun{sign(df$c)}
 setMethod("sign", signature(x = "Column"),
           function(x) {
             signum(x)
@@ -1578,11 +1591,10 @@ setMethod("sign", signature(x = "Column"),
 #'
 #' Aggregate function: returns the number of distinct items in a group.
 #'
-#' @family agg_funcs
 #' @rdname countDistinct
-#' @name countDistinct
-#' @aliases countDistinct
+#' @name n_distinct
 #' @export
+#' @examples \dontrun{n_distinct(df$c)}
 setMethod("n_distinct", signature(x = "Column"),
           function(x, ...) {
             countDistinct(x, ...)
@@ -1592,11 +1604,10 @@ setMethod("n_distinct", signature(x = "Column"),
 #'
 #' Aggregate function: returns the number of items in a group.
 #'
-#' @family agg_funcs
 #' @rdname count
-#' @name count
-#' @aliases count
+#' @name n
 #' @export
+#' @examples \dontrun{n(df$c)}
 setMethod("n", signature(x = "Column"),
           function(x) {
             count(x)
@@ -1617,6 +1628,7 @@ setMethod("n", signature(x = "Column"),
 #' @rdname date_format
 #' @name date_format
 #' @export
+#' @examples \dontrun{date_format(df$t, 'MM/dd/yyy')}
 setMethod("date_format", signature(y = "Column", x = "character"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "date_format", 
y@jc, x)
@@ -1631,6 +1643,7 @@ setMethod("date_format", signature(y = "Column", x = 
"character"),
 #' @rdname from_utc_timestamp
 #' @name from_utc_timestamp
 #' @export
+#' @examples \dontrun{from_utc_timestamp(df$t, 'PST')}
 setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", 
"from_utc_timestamp", y@jc, x)
@@ -1649,6 +1662,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x 
= "character"),
 #' @rdname instr
 #' @name instr
 #' @export
+#' @examples \dontrun{instr(df$c, 'b')}
 setMethod("instr", signature(y = "Column", x = "character"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "instr", y@jc, 
x)
@@ -1663,13 +1677,18 @@ setMethod("instr", signature(y = "Column", x = 
"character"),
 #' For example, \code{next_day('2015-07-27', "Sunday")} returns 2015-08-02 
because that is the first
 #' Sunday after 2015-07-27.
 #'
-#' Day of the week parameter is case insensitive, and accepts:
+#' Day of the week parameter is case insensitive, and accepts first three or 
two characters:
 #' "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun".
 #'
 #' @family datetime_funcs
 #' @rdname next_day
 #' @name next_day
 #' @export
+#' @examples
+#'\dontrun{
+#'next_day(df$d, 'Sun')
+#'next_day(df$d, 'Sunday')
+#'}
 setMethod("next_day", signature(y = "Column", x = "character"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "next_day", 
y@jc, x)
@@ -1684,6 +1703,7 @@ setMethod("next_day", signature(y = "Column", x = 
"character"),
 #' @rdname to_utc_timestamp
 #' @name to_utc_timestamp
 #' @export
+#' @examples \dontrun{to_utc_timestamp(df$t, 'PST')}
 setMethod("to_utc_timestamp", signature(y = "Column", x = "character"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", 
"to_utc_timestamp", y@jc, x)
@@ -1697,8 +1717,8 @@ setMethod("to_utc_timestamp", signature(y = "Column", x = 
"character"),
 #' @name add_months
 #' @family datetime_funcs
 #' @rdname add_months
-#' @name add_months
 #' @export
+#' @examples \dontrun{add_months(df$d, 1)}
 setMethod("add_months", signature(y = "Column", x = "numeric"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "add_months", 
y@jc, as.integer(x))
@@ -1713,6 +1733,7 @@ setMethod("add_months", signature(y = "Column", x = 
"numeric"),
 #' @rdname date_add
 #' @name date_add
 #' @export
+#' @examples \dontrun{date_add(df$d, 1)}
 setMethod("date_add", signature(y = "Column", x = "numeric"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "date_add", 
y@jc, as.integer(x))
@@ -1727,6 +1748,7 @@ setMethod("date_add", signature(y = "Column", x = 
"numeric"),
 #' @rdname date_sub
 #' @name date_sub
 #' @export
+#' @examples \dontrun{date_sub(df$d, 1)}
 setMethod("date_sub", signature(y = "Column", x = "numeric"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "date_sub", 
y@jc, as.integer(x))
@@ -1735,16 +1757,19 @@ setMethod("date_sub", signature(y = "Column", x = 
"numeric"),
 
 #' format_number
 #'
-#' Formats numeric column x to a format like '#,###,###.##', rounded to d 
decimal places,
+#' Formats numeric column y to a format like '#,###,###.##', rounded to x 
decimal places,
 #' and returns the result as a string column.
 #'
-#' If d is 0, the result has no decimal point or fractional part.
-#' If d < 0, the result will be null.'
+#' If x is 0, the result has no decimal point or fractional part.
+#' If x < 0, the result will be null.
 #'
+#' @param y column to format
+#' @param x number of decimal place to format to
 #' @family string_funcs
 #' @rdname format_number
 #' @name format_number
 #' @export
+#' @examples \dontrun{format_number(df$n, 4)}
 setMethod("format_number", signature(y = "Column", x = "numeric"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -1764,6 +1789,7 @@ setMethod("format_number", signature(y = "Column", x = 
"numeric"),
 #' @rdname sha2
 #' @name sha2
 #' @export
+#' @examples \dontrun{sha2(df$c, 256)}
 setMethod("sha2", signature(y = "Column", x = "numeric"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "sha2", y@jc, 
as.integer(x))
@@ -1779,6 +1805,7 @@ setMethod("sha2", signature(y = "Column", x = "numeric"),
 #' @rdname shiftLeft
 #' @name shiftLeft
 #' @export
+#' @examples \dontrun{shiftLeft(df$c, 1)}
 setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -1796,6 +1823,7 @@ setMethod("shiftLeft", signature(y = "Column", x = 
"numeric"),
 #' @rdname shiftRight
 #' @name shiftRight
 #' @export
+#' @examples \dontrun{shiftRight(df$c, 1)}
 setMethod("shiftRight", signature(y = "Column", x = "numeric"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -1813,6 +1841,7 @@ setMethod("shiftRight", signature(y = "Column", x = 
"numeric"),
 #' @rdname shiftRightUnsigned
 #' @name shiftRightUnsigned
 #' @export
+#' @examples \dontrun{shiftRightUnsigned(df$c, 1)}
 setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"),
           function(y, x) {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -1830,6 +1859,7 @@ setMethod("shiftRightUnsigned", signature(y = "Column", x 
= "numeric"),
 #' @rdname concat_ws
 #' @name concat_ws
 #' @export
+#' @examples \dontrun{concat_ws('-', df$s, df$d)}
 setMethod("concat_ws", signature(sep = "character", x = "Column"),
           function(sep, x, ...) {
             jcols <- lapply(list(x, ...), function(x) { x@jc })
@@ -1845,6 +1875,7 @@ setMethod("concat_ws", signature(sep = "character", x = 
"Column"),
 #' @rdname conv
 #' @name conv
 #' @export
+#' @examples \dontrun{conv(df$n, 2, 16)}
 setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = 
"numeric"),
           function(x, fromBase, toBase) {
             fromBase <- as.integer(fromBase)
@@ -1864,6 +1895,7 @@ setMethod("conv", signature(x = "Column", fromBase = 
"numeric", toBase = "numeri
 #' @rdname expr
 #' @name expr
 #' @export
+#' @examples \dontrun{expr('length(name)')}
 setMethod("expr", signature(x = "character"),
           function(x) {
             jc <- callJStatic("org.apache.spark.sql.functions", "expr", x)
@@ -1878,6 +1910,7 @@ setMethod("expr", signature(x = "character"),
 #' @rdname format_string
 #' @name format_string
 #' @export
+#' @examples \dontrun{format_string('%d %s', df$a, df$b)}
 setMethod("format_string", signature(format = "character", x = "Column"),
           function(format, x, ...) {
             jcols <- lapply(list(x, ...), function(arg) { arg@jc })
@@ -1897,6 +1930,11 @@ setMethod("format_string", signature(format = 
"character", x = "Column"),
 #' @rdname from_unixtime
 #' @name from_unixtime
 #' @export
+#' @examples
+#'\dontrun{
+#'from_unixtime(df$t)
+#'from_unixtime(df$t, 'yyyy/MM/dd HH')
+#'}
 setMethod("from_unixtime", signature(x = "Column"),
           function(x, format = "yyyy-MM-dd HH:mm:ss") {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -1915,6 +1953,7 @@ setMethod("from_unixtime", signature(x = "Column"),
 #' @rdname locate
 #' @name locate
 #' @export
+#' @examples \dontrun{locate('b', df$c, 1)}
 setMethod("locate", signature(substr = "character", str = "Column"),
           function(substr, str, pos = 0) {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -1931,6 +1970,7 @@ setMethod("locate", signature(substr = "character", str = 
"Column"),
 #' @rdname lpad
 #' @name lpad
 #' @export
+#' @examples \dontrun{lpad(df$c, 6, '#')}
 setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),
           function(x, len, pad) {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -1947,12 +1987,13 @@ setMethod("lpad", signature(x = "Column", len = 
"numeric", pad = "character"),
 #' @rdname rand
 #' @name rand
 #' @export
+#' @examples \dontrun{rand()}
 setMethod("rand", signature(seed = "missing"),
           function(seed) {
             jc <- callJStatic("org.apache.spark.sql.functions", "rand")
             column(jc)
           })
-#' @family normal_funcs
+
 #' @rdname rand
 #' @name rand
 #' @export
@@ -1970,12 +2011,13 @@ setMethod("rand", signature(seed = "numeric"),
 #' @rdname randn
 #' @name randn
 #' @export
+#' @examples \dontrun{randn()}
 setMethod("randn", signature(seed = "missing"),
           function(seed) {
             jc <- callJStatic("org.apache.spark.sql.functions", "randn")
             column(jc)
           })
-#' @family normal_funcs
+
 #' @rdname randn
 #' @name randn
 #' @export
@@ -1993,6 +2035,7 @@ setMethod("randn", signature(seed = "numeric"),
 #' @rdname regexp_extract
 #' @name regexp_extract
 #' @export
+#' @examples \dontrun{regexp_extract(df$c, '(\d+)-(\d+)', 1)}
 setMethod("regexp_extract",
           signature(x = "Column", pattern = "character", idx = "numeric"),
           function(x, pattern, idx) {
@@ -2010,6 +2053,7 @@ setMethod("regexp_extract",
 #' @rdname regexp_replace
 #' @name regexp_replace
 #' @export
+#' @examples \dontrun{regexp_replace(df$c, '(\\d+)', '--')}
 setMethod("regexp_replace",
           signature(x = "Column", pattern = "character", replacement = 
"character"),
           function(x, pattern, replacement) {
@@ -2027,6 +2071,7 @@ setMethod("regexp_replace",
 #' @rdname rpad
 #' @name rpad
 #' @export
+#' @examples \dontrun{rpad(df$c, 6, '#')}
 setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"),
           function(x, len, pad) {
             jc <- callJStatic("org.apache.spark.sql.functions",
@@ -2040,12 +2085,17 @@ setMethod("rpad", signature(x = "Column", len = 
"numeric", pad = "character"),
 #' Returns the substring from string str before count occurrences of the 
delimiter delim.
 #' If count is positive, everything the left of the final delimiter (counting 
from left) is
 #' returned. If count is negative, every to the right of the final delimiter 
(counting from the
-#' right) is returned. substring <- index performs a case-sensitive match when 
searching for delim.
+#' right) is returned. substring_index performs a case-sensitive match when 
searching for delim.
 #'
 #' @family string_funcs
 #' @rdname substring_index
 #' @name substring_index
 #' @export
+#' @examples
+#'\dontrun{
+#'substring_index(df$c, '.', 2)
+#'substring_index(df$c, '.', -1)
+#'}
 setMethod("substring_index",
           signature(x = "Column", delim = "character", count = "numeric"),
           function(x, delim, count) {
@@ -2066,6 +2116,7 @@ setMethod("substring_index",
 #' @rdname translate
 #' @name translate
 #' @export
+#' @examples \dontrun{translate(df$c, 'rnlt', '123')}
 setMethod("translate",
           signature(x = "Column", matchingString = "character", replaceString 
= "character"),
           function(x, matchingString, replaceString) {
@@ -2082,12 +2133,18 @@ setMethod("translate",
 #' @rdname unix_timestamp
 #' @name unix_timestamp
 #' @export
+#' @examples
+#'\dontrun{
+#'unix_timestamp()
+#'unix_timestamp(df$t)
+#'unix_timestamp(df$t, 'yyyy-MM-dd HH')
+#'}
 setMethod("unix_timestamp", signature(x = "missing", format = "missing"),
           function(x, format) {
             jc <- callJStatic("org.apache.spark.sql.functions", 
"unix_timestamp")
             column(jc)
           })
-#' @family datetime_funcs
+
 #' @rdname unix_timestamp
 #' @name unix_timestamp
 #' @export
@@ -2096,7 +2153,7 @@ setMethod("unix_timestamp", signature(x = "Column", 
format = "missing"),
             jc <- callJStatic("org.apache.spark.sql.functions", 
"unix_timestamp", x@jc)
             column(jc)
           })
-#' @family datetime_funcs
+
 #' @rdname unix_timestamp
 #' @name unix_timestamp
 #' @export
@@ -2113,7 +2170,9 @@ setMethod("unix_timestamp", signature(x = "Column", 
format = "character"),
 #' @family normal_funcs
 #' @rdname when
 #' @name when
+#' @seealso \link{ifelse}
 #' @export
+#' @examples \dontrun{when(df$age == 2, df$age + 1)}
 setMethod("when", signature(condition = "Column", value = "ANY"),
           function(condition, value) {
               condition <- condition@jc
@@ -2130,7 +2189,9 @@ setMethod("when", signature(condition = "Column", value = 
"ANY"),
 #' @family normal_funcs
 #' @rdname ifelse
 #' @name ifelse
+#' @seealso \link{when}
 #' @export
+#' @examples \dontrun{ifelse(df$a > 1 & df$b > 2, 0, 1)}
 setMethod("ifelse",
           signature(test = "Column", yes = "ANY", no = "ANY"),
           function(test, yes, no) {

http://git-wip-us.apache.org/repos/asf/spark/blob/28e46ab4/R/pkg/inst/tests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 0fbe065..899fc3b 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -880,14 +880,15 @@ test_that("column functions", {
   expect_equal(collect(df3)[[2, 1]], FALSE)
   expect_equal(collect(df3)[[3, 1]], TRUE)
 
-  expect_equal(collect(select(df, sum(df$age)))[1, 1], 49)
+  df4 <- select(df, countDistinct(df$age, df$name))
+  expect_equal(collect(df4)[[1, 1]], 2)
 
+  expect_equal(collect(select(df, sum(df$age)))[1, 1], 49)
   expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6)
-
   expect_equal(collect(select(df, var_pop(df$age)))[1, 1], 30.25)
 
-  df4 <- createDataFrame(sqlContext, list(list(a = "010101")))
-  expect_equal(collect(select(df4, conv(df4$a, 2, 16)))[1, 1], "15")
+  df5 <- createDataFrame(sqlContext, list(list(a = "010101")))
+  expect_equal(collect(select(df5, conv(df5$a, 2, 16)))[1, 1], "15")
 
   # Test array_contains() and sort_array()
   df <- createDataFrame(sqlContext, list(list(list(1L, 2L, 3L)), list(list(6L, 
5L, 4L))))


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-12029][SPARKR] Improve column functions signature, param check, tests, fix doc and add examples

Reply via email to