paleolimbot commented on code in PR #13786:
URL: https://github.com/apache/arrow/pull/13786#discussion_r943592958


##########
r/R/dplyr-mutate.R:
##########
@@ -151,3 +153,95 @@ ensure_named_exprs <- function(exprs) {
   names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr)
   exprs
 }
+
+# Take the input quos and unfold any instances of across()
+# into individual quosures
+unfold_across <- function(.data, quos_in) {
+  quos_out <- list()
+  # Check for any expressions starting with across
+  for (quo_i in seq_along(quos_in)) {
+    quo_in <- quos_in[quo_i]
+    quo_expr <- quo_get_expr(quo_in[[1]])
+
+    if (is_call(quo_expr, "across")) {
+      new_quos <- list()
+      across_call <- match.call(dplyr::across, quo_expr)
+
+      if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) {
+        abort("`...` argument to `across()` is deprecated in dplyr and not 
supported in Arrow")
+      }
+
+      # ARROW-17364: add support for .names argument
+      if (!is.null(across_call[[".names"]])) {
+        abort("`.names` argument to `across()` not yet supported in Arrow")
+      }
+
+      # use select() to get the column names so we can take advantage of 
tidyselect
+      cols <- names(select(.data, !!across_call[[".cols"]]))
+      funcs <- as.character(across_call[[".fns"]])
+
+      # calling across() with .fns = NULL returns all columns unchanged
+      if (is_empty(funcs)) {
+        return()
+      }
+
+      if (funcs[[1]] == "~") {
+        abort(
+          paste(
+            "purrr-style lambda functions as `.fns` argument to `across()`",
+            "not yet supported in Arrow"
+          )
+        )
+      }
+
+      # if only 1 function, we overwrite the old columns with the new values
+      if (length(funcs) == 1) {
+        # work out the quosures from the call
+        col_syms <- syms(cols)
+        new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x)))
+        new_quos <- set_names(new_quos, cols)
+      } else {
+        # remove `c()` and `list()` which have been used to specify functions
+        extracted_funcs <- funcs[map_lgl(funcs, ~ !.x %in% c("c", "list"))]
+
+        func_list <- ensure_named_funcs(extracted_funcs)
+        new_quos <- quosures_from_func_list(func_list, cols)
+      }
+
+      quos_out <- append(quos_out, new_quos)

Review Comment:
   I'm still wrapping my head around this but I think that *somewhere* you need 
to `rlang::quo_set_env(quo_out, rlang::quo_get_env(quo_in))` to make sure that 
symbol references that are *not* columns are fetched from the calling 
environment. I'm struggling to come up with an example where that can happen so 
maybe this isn't relevant here, but it seems like somehow 
`rlang::quo_get_env(quo_in)` should be passed on to the output quosures? 



##########
r/R/dplyr-mutate.R:
##########
@@ -151,3 +153,95 @@ ensure_named_exprs <- function(exprs) {
   names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr)
   exprs
 }
+
+# Take the input quos and unfold any instances of across()
+# into individual quosures
+unfold_across <- function(.data, quos_in) {
+  quos_out <- list()
+  # Check for any expressions starting with across
+  for (quo_i in seq_along(quos_in)) {
+    quo_in <- quos_in[quo_i]
+    quo_expr <- quo_get_expr(quo_in[[1]])
+
+    if (is_call(quo_expr, "across")) {
+      new_quos <- list()
+      across_call <- match.call(dplyr::across, quo_expr)
+
+      if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) {
+        abort("`...` argument to `across()` is deprecated in dplyr and not 
supported in Arrow")
+      }
+
+      # ARROW-17364: add support for .names argument
+      if (!is.null(across_call[[".names"]])) {
+        abort("`.names` argument to `across()` not yet supported in Arrow")
+      }
+
+      # use select() to get the column names so we can take advantage of 
tidyselect
+      cols <- names(select(.data, !!across_call[[".cols"]]))

Review Comment:
   ```suggestion
         cols <- names(dplyr::select(.data, !!across_call[[".cols"]]))
   ```



##########
r/R/dplyr-mutate.R:
##########
@@ -151,3 +153,95 @@ ensure_named_exprs <- function(exprs) {
   names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr)
   exprs
 }
+
+# Take the input quos and unfold any instances of across()
+# into individual quosures
+unfold_across <- function(.data, quos_in) {
+  quos_out <- list()
+  # Check for any expressions starting with across
+  for (quo_i in seq_along(quos_in)) {
+    quo_in <- quos_in[quo_i]
+    quo_expr <- quo_get_expr(quo_in[[1]])
+
+    if (is_call(quo_expr, "across")) {
+      new_quos <- list()
+      across_call <- match.call(dplyr::across, quo_expr)
+
+      if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) {
+        abort("`...` argument to `across()` is deprecated in dplyr and not 
supported in Arrow")
+      }
+
+      # ARROW-17364: add support for .names argument
+      if (!is.null(across_call[[".names"]])) {
+        abort("`.names` argument to `across()` not yet supported in Arrow")
+      }
+
+      # use select() to get the column names so we can take advantage of 
tidyselect
+      cols <- names(select(.data, !!across_call[[".cols"]]))
+      funcs <- as.character(across_call[[".fns"]])
+
+      # calling across() with .fns = NULL returns all columns unchanged
+      if (is_empty(funcs)) {
+        return()
+      }
+
+      if (funcs[[1]] == "~") {
+        abort(
+          paste(
+            "purrr-style lambda functions as `.fns` argument to `across()`",
+            "not yet supported in Arrow"
+          )
+        )
+      }
+
+      # if only 1 function, we overwrite the old columns with the new values
+      if (length(funcs) == 1) {
+        # work out the quosures from the call
+        col_syms <- syms(cols)
+        new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x)))
+        new_quos <- set_names(new_quos, cols)
+      } else {
+        # remove `c()` and `list()` which have been used to specify functions
+        extracted_funcs <- funcs[map_lgl(funcs, ~ !.x %in% c("c", "list"))]
+
+        func_list <- ensure_named_funcs(extracted_funcs)
+        new_quos <- quosures_from_func_list(func_list, cols)
+      }
+
+      quos_out <- append(quos_out, new_quos)
+    } else {
+      quos_out <- append(quos_out, quo_in)
+    }
+  }
+
+  quos_out
+}
+
+# if the function is unnamed (an empty character), use the index instead
+ensure_named_funcs <- function(funcs) {
+  func_list <- as.list(funcs)
+  func_names <- names(funcs) %||% rep("", length(funcs))
+  func_indices <- seq_along(funcs)
+  names(func_list) <- map2_chr(func_names, func_indices, max)
+  func_list
+}
+
+# given a named list of functions and column names, create a list of new 
quosures
+quosures_from_func_list <- function(func_list, cols) {
+  func_list_full <- rep(func_list, length(cols))
+  cols_list_full <- rep(cols, each = length(func_list))
+
+  # get names of new quosures
+  new_quo_names <- map2_chr(
+    names(func_list_full), cols_list_full,
+    ~ paste(.y, .x, sep = "_")
+  )

Review Comment:
   Can you implement the `.names` argument here? There's almost certainly a 
cleaner way to do this but something like:
   
   ``` r
   withr::with_environment(as.environment(list(.col = "something", .fn = 
"something-else")), glue::glue("{.col}_{.fn}"))
   #> something_something-else
   ```
   
   <sup>Created on 2022-08-11 by the [reprex 
package](https://reprex.tidyverse.org) (v2.0.1)</sup> 



##########
r/tests/testthat/test-dplyr-mutate.R:
##########
@@ -589,3 +588,106 @@ test_that("mutate() and transmute() with namespaced 
functions", {
     tbl
   )
 })
+
+test_that("Can use across() within mutate()", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(c(dbl, dbl2), round)) %>%
+      collect(),
+    tbl
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>%
+      collect(),
+    tbl
+  )
+
+  # across() arguments not in default order
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(.fns = round, c(dbl, dbl2))) %>%
+      collect(),
+    tbl
+  )
+
+  # ARROW-17364: .names argument not yet supported for across()
+  expect_error(
+    tbl %>%
+      arrow_table() %>%
+      mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>%
+      collect(),
+    regexp = "`.names` argument to `across()` not yet supported in Arrow",
+    fixed = TRUE
+  )
+
+  # ellipses (...) are a deprecated argument
+  expect_error(
+    tbl %>%
+      arrow_table() %>%
+      mutate(across(c(dbl, dbl2), round, digits = -1)) %>%
+      collect(),
+    regexp = "`...` argument to `across()` is deprecated in dplyr and not 
supported in Arrow",
+    fixed = TRUE
+  )
+
+  # alternative ways of specifying .fns - as a list
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(1:dbl2, list(round))) %>%
+      collect(),
+    tbl
+  )
+

Review Comment:
   It seems like this will work but what about `across(1:dbl2, list("fun1" = 
round, "fun2" = exp))`? Does putting something obnoxious like `across(1:dbl2, 
list("fun1" = round(this_is_not_cool(something_else)), "fun2" = exp))` result 
in an interpretable error?



##########
r/R/dplyr-mutate.R:
##########
@@ -24,7 +24,9 @@ mutate.arrow_dplyr_query <- function(.data,
                                      .before = NULL,
                                      .after = NULL) {
   call <- match.call()
-  exprs <- ensure_named_exprs(quos(...))
+
+  expression_list <- unfold_across(.data, quos(...))
+  exprs <- ensure_named_exprs(expression_list)

Review Comment:
   Should this also get copied to `filter()` (or maybe that's already another 
ticket)?



##########
r/tests/testthat/test-dplyr-mutate.R:
##########
@@ -589,3 +588,106 @@ test_that("mutate() and transmute() with namespaced 
functions", {
     tbl
   )
 })
+
+test_that("Can use across() within mutate()", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(c(dbl, dbl2), round)) %>%
+      collect(),
+    tbl
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>%
+      collect(),
+    tbl
+  )
+
+  # across() arguments not in default order
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(.fns = round, c(dbl, dbl2))) %>%
+      collect(),
+    tbl
+  )
+
+  # ARROW-17364: .names argument not yet supported for across()
+  expect_error(
+    tbl %>%
+      arrow_table() %>%
+      mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>%
+      collect(),
+    regexp = "`.names` argument to `across()` not yet supported in Arrow",
+    fixed = TRUE
+  )
+
+  # ellipses (...) are a deprecated argument
+  expect_error(
+    tbl %>%
+      arrow_table() %>%
+      mutate(across(c(dbl, dbl2), round, digits = -1)) %>%
+      collect(),
+    regexp = "`...` argument to `across()` is deprecated in dplyr and not 
supported in Arrow",
+    fixed = TRUE
+  )
+
+  # alternative ways of specifying .fns - as a list
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(1:dbl2, list(round))) %>%
+      collect(),
+    tbl
+  )
+
+  # supply .fns as a one-item vector
+  compare_dplyr_binding(
+    .input %>%
+      mutate(across(1:dbl2, c(round))) %>%
+      collect(),
+    tbl
+  )
+
+  # ARROW-17366: purrr-style lmabda functions not yet supported

Review Comment:
   ```suggestion
     # ARROW-17366: purrr-style lambda functions not yet supported
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to