paleolimbot commented on code in PR #13786:
URL: https://github.com/apache/arrow/pull/13786#discussion_r943592958
##########
r/R/dplyr-mutate.R:
##########
@@ -151,3 +153,95 @@ ensure_named_exprs <- function(exprs) {
names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr)
exprs
}
+
+# Take the input quos and unfold any instances of across()
+# into individual quosures
+unfold_across <- function(.data, quos_in) {
+ quos_out <- list()
+ # Check for any expressions starting with across
+ for (quo_i in seq_along(quos_in)) {
+ quo_in <- quos_in[quo_i]
+ quo_expr <- quo_get_expr(quo_in[[1]])
+
+ if (is_call(quo_expr, "across")) {
+ new_quos <- list()
+ across_call <- match.call(dplyr::across, quo_expr)
+
+ if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) {
+ abort("`...` argument to `across()` is deprecated in dplyr and not
supported in Arrow")
+ }
+
+ # ARROW-17364: add support for .names argument
+ if (!is.null(across_call[[".names"]])) {
+ abort("`.names` argument to `across()` not yet supported in Arrow")
+ }
+
+ # use select() to get the column names so we can take advantage of
tidyselect
+ cols <- names(select(.data, !!across_call[[".cols"]]))
+ funcs <- as.character(across_call[[".fns"]])
+
+ # calling across() with .fns = NULL returns all columns unchanged
+ if (is_empty(funcs)) {
+ return()
+ }
+
+ if (funcs[[1]] == "~") {
+ abort(
+ paste(
+ "purrr-style lambda functions as `.fns` argument to `across()`",
+ "not yet supported in Arrow"
+ )
+ )
+ }
+
+ # if only 1 function, we overwrite the old columns with the new values
+ if (length(funcs) == 1) {
+ # work out the quosures from the call
+ col_syms <- syms(cols)
+ new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x)))
+ new_quos <- set_names(new_quos, cols)
+ } else {
+ # remove `c()` and `list()` which have been used to specify functions
+ extracted_funcs <- funcs[map_lgl(funcs, ~ !.x %in% c("c", "list"))]
+
+ func_list <- ensure_named_funcs(extracted_funcs)
+ new_quos <- quosures_from_func_list(func_list, cols)
+ }
+
+ quos_out <- append(quos_out, new_quos)
Review Comment:
I'm still wrapping my head around this but I think that *somewhere* you need
to `rlang::quo_set_env(quo_out, rlang::quo_get_env(quo_in))` to make sure that
symbol references that are *not* columns are fetched from the calling
environment. I'm struggling to come up with an example where that can happen so
maybe this isn't relevant here, but it seems like somehow
`rlang::quo_get_env(quo_in)` should be passed on to the output quosures?
##########
r/R/dplyr-mutate.R:
##########
@@ -151,3 +153,95 @@ ensure_named_exprs <- function(exprs) {
names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr)
exprs
}
+
+# Take the input quos and unfold any instances of across()
+# into individual quosures
+unfold_across <- function(.data, quos_in) {
+ quos_out <- list()
+ # Check for any expressions starting with across
+ for (quo_i in seq_along(quos_in)) {
+ quo_in <- quos_in[quo_i]
+ quo_expr <- quo_get_expr(quo_in[[1]])
+
+ if (is_call(quo_expr, "across")) {
+ new_quos <- list()
+ across_call <- match.call(dplyr::across, quo_expr)
+
+ if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) {
+ abort("`...` argument to `across()` is deprecated in dplyr and not
supported in Arrow")
+ }
+
+ # ARROW-17364: add support for .names argument
+ if (!is.null(across_call[[".names"]])) {
+ abort("`.names` argument to `across()` not yet supported in Arrow")
+ }
+
+ # use select() to get the column names so we can take advantage of
tidyselect
+ cols <- names(select(.data, !!across_call[[".cols"]]))
Review Comment:
```suggestion
cols <- names(dplyr::select(.data, !!across_call[[".cols"]]))
```
##########
r/R/dplyr-mutate.R:
##########
@@ -151,3 +153,95 @@ ensure_named_exprs <- function(exprs) {
names(exprs)[unnamed] <- map_chr(exprs[unnamed], format_expr)
exprs
}
+
+# Take the input quos and unfold any instances of across()
+# into individual quosures
+unfold_across <- function(.data, quos_in) {
+ quos_out <- list()
+ # Check for any expressions starting with across
+ for (quo_i in seq_along(quos_in)) {
+ quo_in <- quos_in[quo_i]
+ quo_expr <- quo_get_expr(quo_in[[1]])
+
+ if (is_call(quo_expr, "across")) {
+ new_quos <- list()
+ across_call <- match.call(dplyr::across, quo_expr)
+
+ if (!all(names(across_call[-1]) %in% c(".cols", ".fns", ".names"))) {
+ abort("`...` argument to `across()` is deprecated in dplyr and not
supported in Arrow")
+ }
+
+ # ARROW-17364: add support for .names argument
+ if (!is.null(across_call[[".names"]])) {
+ abort("`.names` argument to `across()` not yet supported in Arrow")
+ }
+
+ # use select() to get the column names so we can take advantage of
tidyselect
+ cols <- names(select(.data, !!across_call[[".cols"]]))
+ funcs <- as.character(across_call[[".fns"]])
+
+ # calling across() with .fns = NULL returns all columns unchanged
+ if (is_empty(funcs)) {
+ return()
+ }
+
+ if (funcs[[1]] == "~") {
+ abort(
+ paste(
+ "purrr-style lambda functions as `.fns` argument to `across()`",
+ "not yet supported in Arrow"
+ )
+ )
+ }
+
+ # if only 1 function, we overwrite the old columns with the new values
+ if (length(funcs) == 1) {
+ # work out the quosures from the call
+ col_syms <- syms(cols)
+ new_quos <- map(col_syms, ~ quo(!!call2(funcs, .x)))
+ new_quos <- set_names(new_quos, cols)
+ } else {
+ # remove `c()` and `list()` which have been used to specify functions
+ extracted_funcs <- funcs[map_lgl(funcs, ~ !.x %in% c("c", "list"))]
+
+ func_list <- ensure_named_funcs(extracted_funcs)
+ new_quos <- quosures_from_func_list(func_list, cols)
+ }
+
+ quos_out <- append(quos_out, new_quos)
+ } else {
+ quos_out <- append(quos_out, quo_in)
+ }
+ }
+
+ quos_out
+}
+
+# if the function is unnamed (an empty character), use the index instead
+ensure_named_funcs <- function(funcs) {
+ func_list <- as.list(funcs)
+ func_names <- names(funcs) %||% rep("", length(funcs))
+ func_indices <- seq_along(funcs)
+ names(func_list) <- map2_chr(func_names, func_indices, max)
+ func_list
+}
+
+# given a named list of functions and column names, create a list of new
quosures
+quosures_from_func_list <- function(func_list, cols) {
+ func_list_full <- rep(func_list, length(cols))
+ cols_list_full <- rep(cols, each = length(func_list))
+
+ # get names of new quosures
+ new_quo_names <- map2_chr(
+ names(func_list_full), cols_list_full,
+ ~ paste(.y, .x, sep = "_")
+ )
Review Comment:
Can you implement the `.names` argument here? There's almost certainly a
cleaner way to do this but something like:
``` r
withr::with_environment(as.environment(list(.col = "something", .fn =
"something-else")), glue::glue("{.col}_{.fn}"))
#> something_something-else
```
<sup>Created on 2022-08-11 by the [reprex
package](https://reprex.tidyverse.org) (v2.0.1)</sup>
##########
r/tests/testthat/test-dplyr-mutate.R:
##########
@@ -589,3 +588,106 @@ test_that("mutate() and transmute() with namespaced
functions", {
tbl
)
})
+
+test_that("Can use across() within mutate()", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(c(dbl, dbl2), round)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>%
+ collect(),
+ tbl
+ )
+
+ # across() arguments not in default order
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(.fns = round, c(dbl, dbl2))) %>%
+ collect(),
+ tbl
+ )
+
+ # ARROW-17364: .names argument not yet supported for across()
+ expect_error(
+ tbl %>%
+ arrow_table() %>%
+ mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>%
+ collect(),
+ regexp = "`.names` argument to `across()` not yet supported in Arrow",
+ fixed = TRUE
+ )
+
+ # ellipses (...) are a deprecated argument
+ expect_error(
+ tbl %>%
+ arrow_table() %>%
+ mutate(across(c(dbl, dbl2), round, digits = -1)) %>%
+ collect(),
+ regexp = "`...` argument to `across()` is deprecated in dplyr and not
supported in Arrow",
+ fixed = TRUE
+ )
+
+ # alternative ways of specifying .fns - as a list
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(1:dbl2, list(round))) %>%
+ collect(),
+ tbl
+ )
+
Review Comment:
It seems like this will work but what about `across(1:dbl2, list("fun1" =
round, "fun2" = exp))`? Does putting something obnoxious like `across(1:dbl2,
list("fun1" = round(this_is_not_cool(something_else)), "fun2" = exp))` result
in an interpretable error?
##########
r/R/dplyr-mutate.R:
##########
@@ -24,7 +24,9 @@ mutate.arrow_dplyr_query <- function(.data,
.before = NULL,
.after = NULL) {
call <- match.call()
- exprs <- ensure_named_exprs(quos(...))
+
+ expression_list <- unfold_across(.data, quos(...))
+ exprs <- ensure_named_exprs(expression_list)
Review Comment:
Should this also get copied to `filter()` (or maybe that's already another
ticket?)
##########
r/tests/testthat/test-dplyr-mutate.R:
##########
@@ -589,3 +588,106 @@ test_that("mutate() and transmute() with namespaced
functions", {
tbl
)
})
+
+test_that("Can use across() within mutate()", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(c(dbl, dbl2), round)) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(c(dbl, dbl2), list(exp, sqrt))) %>%
+ collect(),
+ tbl
+ )
+
+ # across() arguments not in default order
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(.fns = round, c(dbl, dbl2))) %>%
+ collect(),
+ tbl
+ )
+
+ # ARROW-17364: .names argument not yet supported for across()
+ expect_error(
+ tbl %>%
+ arrow_table() %>%
+ mutate(across(c(dbl, dbl2), round, .names = "{.col}.{.fn}")) %>%
+ collect(),
+ regexp = "`.names` argument to `across()` not yet supported in Arrow",
+ fixed = TRUE
+ )
+
+ # ellipses (...) are a deprecated argument
+ expect_error(
+ tbl %>%
+ arrow_table() %>%
+ mutate(across(c(dbl, dbl2), round, digits = -1)) %>%
+ collect(),
+ regexp = "`...` argument to `across()` is deprecated in dplyr and not
supported in Arrow",
+ fixed = TRUE
+ )
+
+ # alternative ways of specifying .fns - as a list
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(1:dbl2, list(round))) %>%
+ collect(),
+ tbl
+ )
+
+ # supply .fns as a one-item vector
+ compare_dplyr_binding(
+ .input %>%
+ mutate(across(1:dbl2, c(round))) %>%
+ collect(),
+ tbl
+ )
+
+ # ARROW-17366: purrr-style lmabda functions not yet supported
Review Comment:
```suggestion
# ARROW-17366: purrr-style lambda functions not yet supported
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]