[spark] branch branch-3.1 updated: [SPARK-26199][SPARK-31517][R] Fix strategy for handling ... names in mutate

gurwls223 Thu, 17 Dec 2020 03:43:53 -0800

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new b557147  [SPARK-26199][SPARK-31517][R] Fix strategy for handling ... 
names in mutate
b557147 is described below

commit b5571476a5171495770a556b0b1da044263d0ebe
Author: Michael Chirico <michael.chir...@grabtaxi.com>
AuthorDate: Thu Dec 17 17:20:45 2020 +0900

    [SPARK-26199][SPARK-31517][R] Fix strategy for handling ... names in mutate
    
    ### What changes were proposed in this pull request?
    
    Change the strategy for how the varargs are handled in the default `mutate` 
method
    
    ### Why are the changes needed?
    
    Bugfix -- `deparse` + `sapply` not working as intended due to `width.cutoff`
    
    ### Does this PR introduce any user-facing change?
    
    Yes, bugfix. Shouldn't change any working code.
    
    ### How was this patch tested?
    
    None! yet.
    
    Closes #28386 from MichaelChirico/r-mutate-deparse.
    
    Lead-authored-by: Michael Chirico <michael.chir...@grabtaxi.com>
    Co-authored-by: Michael Chirico <michaelchiri...@gmail.com>
    Signed-off-by: HyukjinKwon <gurwls...@apache.org>
---
 R/pkg/R/DataFrame.R                   | 18 ++++++++++--------
 R/pkg/tests/fulltests/test_sparkSQL.R | 15 +++++++++++++++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 31a651e..8ca338f 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2277,16 +2277,17 @@ setMethod("mutate",
 
             # For named arguments, use the names for arguments as the column 
names
             # For unnamed arguments, use the argument symbols as the column 
names
-            args <- sapply(substitute(list(...))[-1], deparse)
             ns <- names(cols)
-            if (!is.null(ns)) {
-              lapply(seq_along(args), function(i) {
-                if (ns[[i]] != "") {
-                  args[[i]] <<- ns[[i]]
-                }
+            if (is.null(ns)) ns <- rep("", length(cols))
+            named_idx <- nzchar(ns)
+            if (!all(named_idx)) {
+              # SPARK-31517: deparse uses width.cutoff on wide input and the
+              #   output is length>1, so need to collapse it to scalar
+              colsub <- substitute(list(...))[-1L]
+              ns[!named_idx] <- sapply(which(!named_idx), function(ii) {
+                paste(gsub("^\\s*|\\s*$", "", deparse(colsub[[ii]])), collapse 
= " ")
               })
             }
-            ns <- args
 
             # The last column of the same name in the specific columns takes 
effect
             deDupCols <- list()
@@ -3444,7 +3445,8 @@ setMethod("as.data.frame",
 #' @note attach since 1.6.0
 setMethod("attach",
           signature(what = "SparkDataFrame"),
-          function(what, pos = 2L, name = deparse(substitute(what), backtick = 
FALSE),
+          function(what, pos = 2L,
+                   name = paste(deparse(substitute(what), backtick = FALSE), 
collapse = " "),
                    warn.conflicts = TRUE) {
             args <- as.list(environment()) # capture all parameters - this 
must be the first line
             newEnv <- assignNewEnv(args$what)
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R 
b/R/pkg/tests/fulltests/test_sparkSQL.R
index c623f53..ebf08b9 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2884,6 +2884,15 @@ test_that("mutate(), transform(), rename() and names()", 
{
   expect_equal(nrow(result), 153)
   expect_equal(ncol(result), 2)
   detach(airquality)
+
+  # ensure long inferred names are handled without error (SPARK-26199)
+  #   test implicitly assumes eval(formals(deparse)$width.cutoff) = 60
+  #   (which has always been true as of 2020-11-15)
+  newDF <- mutate(
+    df,
+    df$age + 12345678901234567890 + 12345678901234567890 + 12345678901234
+  )
+  expect_match(tail(columns(newDF), 1L), "234567890", fixed = TRUE)
 })
 
 test_that("read/write ORC files", {
@@ -3273,6 +3282,12 @@ test_that("attach() on a DataFrame", {
   stat3 <- summary(df[, "age", drop = F])
   expect_equal(collect(stat3)[8, "age"], "30")
   expect_error(age)
+
+  # attach method uses deparse(); ensure no errors from a very long input
+  abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnop <- df # 
nolint
+  attach(abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnop)
+  expect_true(any(grepl("abcdefghijklmnopqrstuvwxyz", search())))
+  
detach("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnop")
 })
 
 test_that("with() on a DataFrame", {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch branch-3.1 updated: [SPARK-26199][SPARK-31517][R] Fix strategy for handling ... names in mutate

Reply via email to