This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch maint-10.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 0f4ba5360da4eca99830e2b11c2ee9c549039b52
Author: Neal Richardson <[email protected]>
AuthorDate: Sat Oct 22 08:49:18 2022 -0400

    ARROW-18132: [R] Add deprecation cycle for pull() change (#14475)
    
    Authored-by: Neal Richardson <[email protected]>
    Signed-off-by: Neal Richardson <[email protected]>
---
 r/NEWS.md                           | 13 ++++++-----
 r/R/arrow-package.R                 |  8 ++++++-
 r/R/dplyr-collect.R                 | 45 ++++++++++++++++++++++++++++++++-----
 r/R/dplyr-funcs-doc.R               |  2 +-
 r/R/dplyr-group-by.R                |  5 ++++-
 r/man/acero.Rd                      |  2 +-
 r/man/cast.Rd                       |  2 +-
 r/tests/testthat/helper-arrow.R     |  4 ++++
 r/tests/testthat/test-dplyr-query.R | 17 +++++++++++---
 9 files changed, 80 insertions(+), 18 deletions(-)

diff --git a/r/NEWS.md b/r/NEWS.md
index 11d3df1e88..01ce44ce1f 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -45,11 +45,14 @@ A few new features and bugfixes were implemented for joins:
   join keys (when `keep = FALSE`), avoiding the issue where the join keys would
   be all `NA` for rows in the right hand side without any matches on the left.
 
-A few breaking changes that improve the consistency of the API:
-
-* Calling `dplyr::pull()` will return a `?ChunkedArray` instead of an R vector.
-* Calling `dplyr::compute()` on a query that is grouped
-  returns a `?Table`, instead of a query object.
+Some changes to improve the consistency of the API:
+
+* In a future release, calling `dplyr::pull()` will return a `?ChunkedArray`
+  instead of an R vector by default. The current default behavior is deprecated.
+  To update to the new behavior now, specify `pull(as_vector = FALSE)` or set
+  `options(arrow.pull_as_vector = FALSE)` globally.
+* Calling `dplyr::compute()` on a query that is grouped returns a `?Table`
+  instead of a query object.
 
 Finally, long-running queries can now be cancelled and will abort their
 computation immediately.
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 1ab4e41a7a..aca593551f 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -54,7 +54,13 @@ supported_dplyr_methods <- list(
   transmute = NULL,
   arrange = NULL,
   rename = NULL,
-  pull = "returns an Arrow [ChunkedArray], not an R vector",
+  pull = c(
+    "the `name` argument is not supported;",
+    "returns an R vector by default but this behavior is deprecated and will",
+    "return an Arrow [ChunkedArray] in a future release. Provide",
+    "`as_vector = TRUE/FALSE` to control this behavior, or set",
+    "`options(arrow.pull_as_vector)` globally."
+  ),
   relocate = NULL,
   compute = NULL,
   collapse = NULL,
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index 4f8ffc7c1a..8bf22728d6 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -46,16 +46,51 @@ compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame =
 compute.ArrowTabular <- function(x, ...) x
 compute.Dataset <- compute.RecordBatchReader <- compute.arrow_dplyr_query
 
-pull.arrow_dplyr_query <- function(.data, var = -1) {
+pull.Dataset <- function(.data,
+                         var = -1,
+                         ...,
+                         as_vector = getOption("arrow.pull_as_vector")) {
   .data <- as_adq(.data)
   var <- vars_pull(names(.data), !!enquo(var))
   .data$selected_columns <- set_names(.data$selected_columns[var], var)
-  dplyr::compute(.data)[[1]]
+  out <- dplyr::compute(.data)[[1]]
+  handle_pull_as_vector(out, as_vector)
+}
+pull.RecordBatchReader <- pull.arrow_dplyr_query <- pull.Dataset
+
+pull.ArrowTabular <- function(x,
+                              var = -1,
+                              ...,
+                              as_vector = getOption("arrow.pull_as_vector")) {
+  out <- x[[vars_pull(names(x), !!enquo(var))]]
+  handle_pull_as_vector(out, as_vector)
 }
-pull.Dataset <- pull.RecordBatchReader <- pull.arrow_dplyr_query
 
-pull.ArrowTabular <- function(x, var = -1) {
-  x[[vars_pull(names(x), !!enquo(var))]]
+handle_pull_as_vector <- function(out, as_vector) {
+  if (is.null(as_vector)) {
+    warn(
+      c(
+        paste(
+          "Default behavior of `pull()` on Arrow data is changing. Current",
+          "behavior of returning an R vector is deprecated, and in a future",
+          "release, it will return an Arrow `ChunkedArray`. To control this:"
+        ),
+        i = paste(
+          "Specify `as_vector = TRUE` (the current default) or",
+          "`FALSE` (what it will change to) in `pull()`"
+        ),
+        i = "Or, set `options(arrow.pull_as_vector)` globally"
+      ),
+      .frequency = "regularly",
+      .frequency_id = "arrow.pull_as_vector",
+      class = "lifecycle_warning_deprecated"
+    )
+    as_vector <- TRUE
+  }
+  if (as_vector) {
+    out <- as.vector(out)
+  }
+  out
 }
 
 restore_dplyr_features <- function(df, query) {
diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R
index eb0f582201..b8337e3069 100644
--- a/r/R/dplyr-funcs-doc.R
+++ b/r/R/dplyr-funcs-doc.R
@@ -54,7 +54,7 @@
 #' * [`inner_join()`][dplyr::inner_join()]: the `copy` and `na_matches` arguments are ignored
 #' * [`left_join()`][dplyr::left_join()]: the `copy` and `na_matches` arguments are ignored
 #' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported
-#' * [`pull()`][dplyr::pull()]: returns an Arrow [ChunkedArray], not an R vector
+#' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally.
 #' * [`relocate()`][dplyr::relocate()]
 #' * [`rename()`][dplyr::rename()]
 #' * [`rename_with()`][dplyr::rename_with()]
diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
index 57cf417c9a..85825b9bf2 100644
--- a/r/R/dplyr-group-by.R
+++ b/r/R/dplyr-group-by.R
@@ -25,7 +25,10 @@ group_by.arrow_dplyr_query <- function(.data,
                                        .drop = dplyr::group_by_drop_default(.data)) {
   if (!missing(add)) {
     .Deprecated(
-      msg = paste("The `add` argument of `group_by()` is deprecated. Please use the `.add` argument instead.")
+      msg = paste(
+        "The `add` argument of `group_by()` is deprecated.",
+        "Please use the `.add` argument instead."
+      )
     )
     .add <- add
   }
diff --git a/r/man/acero.Rd b/r/man/acero.Rd
index d340c2cbd8..84adf081de 100644
--- a/r/man/acero.Rd
+++ b/r/man/acero.Rd
@@ -38,7 +38,7 @@ Table into an R \code{data.frame}.
 \item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} and \code{na_matches} arguments are ignored
 \item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} and \code{na_matches} arguments are ignored
 \item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. things that require aggregation within groups) not currently supported
-\item \code{\link[dplyr:pull]{pull()}}: returns an Arrow \link{ChunkedArray}, not an R vector
+\item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future release. Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally.
 \item \code{\link[dplyr:relocate]{relocate()}}
 \item \code{\link[dplyr:rename]{rename()}}
 \item \code{\link[dplyr:rename]{rename_with()}}
diff --git a/r/man/cast.Rd b/r/man/cast.Rd
index 6d87958376..81e729c704 100644
--- a/r/man/cast.Rd
+++ b/r/man/cast.Rd
@@ -34,7 +34,7 @@ mtcars \%>\%
 \seealso{
 \code{\link{data-type}} for a list of \link{DataType} to be used with \code{to}.
 
-\href{https://arrow.apache.org/docs/cpp/api/compute.html?highlight=castoptions#arrow\%3A\%3Acompute\%3A\%3ACastOptions}{Arrow C++ CastOptions documentation}
+\href{https://arrow.apache.org/docs/cpp/api/compute.html?highlight=castoptions#arrow\%3A\%3Acompute\%3A\%3ACastOptions}{Arrow C++ CastOptions documentation} # nolint
 for the list of supported CastOptions.
 }
 \keyword{internal}
diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R
index d705a8029c..6812a3eec0 100644
--- a/r/tests/testthat/helper-arrow.R
+++ b/r/tests/testthat/helper-arrow.R
@@ -29,6 +29,10 @@ Sys.setlocale("LC_COLLATE", "C")
 # (R CMD check does this, but in case you're running outside of check)
 Sys.setenv(LANGUAGE = "en")
 
+# Set this option so that the deprecation warning isn't shown
+# (except when we test for it)
+options(arrow.pull_as_vector = FALSE)
+
 with_language <- function(lang, expr) {
   old <- Sys.getenv("LANGUAGE")
   # Check what this message is before changing languages; this will
diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R
index db9a3bb30d..ef9a9bcdc1 100644
--- a/r/tests/testthat/test-dplyr-query.R
+++ b/r/tests/testthat/test-dplyr-query.R
@@ -91,6 +91,17 @@ test_that("pull", {
   )
 })
 
+test_that("pull() shows a deprecation warning if the option isn't set", {
+  expect_warning(
+    vec <- tbl %>%
+      arrow_table() %>%
+      pull(as_vector = NULL),
+    "Current behavior of returning an R vector is deprecated"
+  )
+  # And the default is the old behavior, an R vector
+  expect_identical(vec, pull(tbl))
+})
+
 test_that("collect(as_data_frame=FALSE)", {
   batch <- record_batch(tbl)
 
@@ -583,9 +594,9 @@ test_that("needs_projection unit tests", {
 
 test_that("compute() on a grouped query returns a Table with groups in metadata", {
   tab1 <- tbl %>%
-      arrow_table() %>%
-      group_by(int) %>%
-      compute()
+    arrow_table() %>%
+    group_by(int) %>%
+    compute()
   expect_r6_class(tab1, "Table")
   expect_equal(
     as.data.frame(tab1),

Reply via email to