This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new be96d2059c GH-14947 [R] Compatibility with dplyr 1.1.0 (#14948)
be96d2059c is described below

commit be96d2059cebd26ec69e5830fc4a9307029a4006
Author: Lionel Henry <[email protected]>
AuthorDate: Mon Dec 26 19:18:21 2022 +0100

    GH-14947 [R] Compatibility with dplyr 1.1.0 (#14948)
    
    Each commit in this PR fixes a breaking/behaviour change described in 
#14947.
    
    I could not build the dev version of arrow, so this is only tested against 
the CRAN version. Also, there might be changes introduced since the last 
release that require compatibility adjustments. Could someone pick this up 
from here, please?
    
    Closes #14947.
    * Closes: #14947
    
    Lead-authored-by: Lionel Henry <[email protected]>
    Co-authored-by: Dewey Dunnington <[email protected]>
    Co-authored-by: Dewey Dunnington <[email protected]>
    Signed-off-by: Dewey Dunnington <[email protected]>
---
 r/R/dplyr-distinct.R                       |  3 ---
 r/R/dplyr-funcs-simple.R                   | 25 +++++++++++++++++++++---
 r/R/dplyr-join.R                           | 10 ++++------
 r/R/enums.R                                |  7 +++++++
 r/tests/testthat/_snaps/dplyr-query.md     |  4 ++++
 r/tests/testthat/_snaps/dplyr-summarize.md | 13 +++++++++++++
 r/tests/testthat/test-dataset-dplyr.R      |  2 +-
 r/tests/testthat/test-dataset.R            | 19 +++++++++++++-----
 r/tests/testthat/test-dplyr-distinct.R     | 12 ++++++++++++
 r/tests/testthat/test-dplyr-filter.R       | 18 -----------------
 r/tests/testthat/test-dplyr-group-by.R     |  2 +-
 r/tests/testthat/test-dplyr-join.R         | 31 ++++++++++++++----------------
 r/tests/testthat/test-dplyr-query.R        | 16 +++------------
 r/tests/testthat/test-dplyr-slice.R        |  4 ++--
 r/tests/testthat/test-dplyr-summarize.R    | 25 ++++++++++++------------
 15 files changed, 110 insertions(+), 81 deletions(-)

diff --git a/r/R/dplyr-distinct.R b/r/R/dplyr-distinct.R
index c663d84e65..49948caa01 100644
--- a/r/R/dplyr-distinct.R
+++ b/r/R/dplyr-distinct.R
@@ -28,9 +28,6 @@ distinct.arrow_dplyr_query <- function(.data, ..., .keep_all 
= FALSE) {
   if (length(quos(...))) {
     # group_by() calls mutate() if there are any expressions in ...
     .data <- dplyr::group_by(.data, ..., .add = TRUE)
-    # `data %>% group_by() %>% summarise()` returns cols in order supplied
-    # but distinct() returns cols in dataset order, so sort group vars
-    .data$group_by_vars <- names(.data)[names(.data) %in% .data$group_by_vars]
   } else {
     # distinct() with no vars specified means distinct across all cols
     .data <- dplyr::group_by(.data, !!!syms(names(.data)))
diff --git a/r/R/dplyr-funcs-simple.R b/r/R/dplyr-funcs-simple.R
index 64aba4e5eb..308a46601a 100644
--- a/r/R/dplyr-funcs-simple.R
+++ b/r/R/dplyr-funcs-simple.R
@@ -143,7 +143,7 @@ cast_scalars_to_common_type <- function(args) {
   args[!is_expr] <- lapply(args[!is_expr], Scalar$create)
 
   if (any(is_expr)) {
-    try(
+    tryCatch(
       {
         # If the Expression has no Schema embedded, we cannot resolve its
         # type here, so this will error, hence the try() wrapping it
@@ -154,7 +154,14 @@ cast_scalars_to_common_type <- function(args) {
         # we'll just keep the original
         args[!is_expr] <- lapply(args[!is_expr], cast_or_parse, type = to_type)
       },
-      silent = TRUE
+      error = function(e) {
+        # We do want to error for some types of casting errors
+        if (inherits(e, "arrow_error_implicit_cast")) {
+          abort("Cast error", parent = e)
+        }
+
+        # Other cast errors we ignore and let Arrow handle at collect()
+      }
     )
   }
 
@@ -189,7 +196,10 @@ cast_or_parse <- function(x, type) {
   }
 
   # For most types, just cast.
-  # But for string -> date/time, we need to call a parsing function
+  # For string -> date/time, we need to call a parsing function.
+  # In dplyr 1.1.0, vctrs::vec_ptype2() rules are used, which are mostly
+  # the same as Arrow rules except that implicit conversion from string to
+  # numeric is no longer supported.
   if (x$type_id() %in% c(Type[["STRING"]], Type[["LARGE_STRING"]])) {
     if (to_type_id %in% c(Type[["DATE32"]], Type[["DATE64"]])) {
       x <- call_function(
@@ -211,6 +221,15 @@ cast_or_parse <- function(x, type) {
         x,
         options = list(timezone = Sys.timezone())
       )
+    } else if (to_type_id %in% unlist(TYPES_NUMERIC)) {
+      abort(
+        sprintf(
+          "Implicit cast from %s to %s is not supported",
+          x$type$ToString(),
+          type$ToString()
+        ),
+        class = "arrow_error_implicit_cast"
+      )
     }
   }
   x$cast(type)
diff --git a/r/R/dplyr-join.R b/r/R/dplyr-join.R
index cb27c478b6..5e2b6084b2 100644
--- a/r/R/dplyr-join.R
+++ b/r/R/dplyr-join.R
@@ -112,9 +112,8 @@ semi_join.arrow_dplyr_query <- function(x,
                                         by = NULL,
                                         copy = FALSE,
                                         suffix = c(".x", ".y"),
-                                        ...,
-                                        keep = FALSE) {
-  do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_SEMI")
+                                        ...) {
+  do_join(x, y, by, copy, suffix, ..., join_type = "LEFT_SEMI")
 }
 semi_join.Dataset <- semi_join.ArrowTabular <- semi_join.RecordBatchReader <- 
semi_join.arrow_dplyr_query
 
@@ -123,9 +122,8 @@ anti_join.arrow_dplyr_query <- function(x,
                                         by = NULL,
                                         copy = FALSE,
                                         suffix = c(".x", ".y"),
-                                        ...,
-                                        keep = FALSE) {
-  do_join(x, y, by, copy, suffix, ..., keep = keep, join_type = "LEFT_ANTI")
+                                        ...) {
+  do_join(x, y, by, copy, suffix, ..., join_type = "LEFT_ANTI")
 }
 anti_join.Dataset <- anti_join.ArrowTabular <- anti_join.RecordBatchReader <- 
anti_join.arrow_dplyr_query
 
diff --git a/r/R/enums.R b/r/R/enums.R
index 727ca9388c..533ebc6c1d 100644
--- a/r/R/enums.R
+++ b/r/R/enums.R
@@ -82,6 +82,13 @@ Type <- enum("Type::type",
 )
 
 TYPES_WITH_NAN <- Type[c("HALF_FLOAT", "FLOAT", "DOUBLE")]
+TYPES_NUMERIC <- Type[
+  c(
+    "INT8", "UINT8", "INT16", "UINT16", "INT32", "UINT32",
+    "INT64", "UINT64", "HALF_FLOAT", "FLOAT", "DOUBLE",
+    "DECIMAL128", "DECIMAL256"
+    )
+  ]
 
 #' @rdname enums
 #' @export
diff --git a/r/tests/testthat/_snaps/dplyr-query.md 
b/r/tests/testthat/_snaps/dplyr-query.md
new file mode 100644
index 0000000000..a9d4da26cc
--- /dev/null
+++ b/r/tests/testthat/_snaps/dplyr-query.md
@@ -0,0 +1,4 @@
+# Scalars in expressions match the type of the field, if possible
+
+    Expression int == "5" not supported in Arrow; pulling data into R
+
diff --git a/r/tests/testthat/_snaps/dplyr-summarize.md 
b/r/tests/testthat/_snaps/dplyr-summarize.md
new file mode 100644
index 0000000000..bbb8e64bfe
--- /dev/null
+++ b/r/tests/testthat/_snaps/dplyr-summarize.md
@@ -0,0 +1,13 @@
+# Functions that take ... but we only accept a single arg
+
+    Code
+      InMemoryDataset$create(tbl) %>% summarize(distinct = n_distinct())
+    Condition
+      Error:
+      ! Error : In n_distinct(), n_distinct() with 0 arguments not supported 
in Arrow
+      Call collect() first to pull data into R.
+
+---
+
+    Error : In n_distinct(int, lgl), Multiple arguments to n_distinct() not 
supported in Arrow; pulling data into R
+
diff --git a/r/tests/testthat/test-dataset-dplyr.R 
b/r/tests/testthat/test-dataset-dplyr.R
index 44e038586e..c8054b0c83 100644
--- a/r/tests/testthat/test-dataset-dplyr.R
+++ b/r/tests/testthat/test-dataset-dplyr.R
@@ -178,7 +178,7 @@ test_that("filter scalar validation doesn't crash 
(ARROW-7772)", {
   ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
   expect_error(
     ds %>%
-      filter(int == "fff", part == 1) %>%
+      filter(int == Expression$scalar("fff"), part == 1) %>%
       collect(),
     "'equal' has no kernel matching input types .int32, string."
   )
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index a46eb8b6a9..5a587e9590 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -586,16 +586,25 @@ test_that("UnionDataset can merge schemas", {
     collect() %>%
     arrange(x)
   expect_equal(colnames(actual), c("x", "y", "z"))
-  expect_equal(
-    actual,
-    union_all(as_tibble(sub_df1), as_tibble(sub_df2))
+
+  union_all_common <- function(...) {
+    common <- vctrs::vec_cast_common(...)
+    rlang::inject(union_all(!!!common))
+  }
+
+  expected <- union_all_common(
+    as_tibble(sub_df1),
+    as_tibble(sub_df2)
   )
+  expect_equal(actual, expected)
 
   # without unifying schemas, takes the first schema and discards any columns
   # in the second which aren't in the first
   ds <- open_dataset(list(ds1, ds2), unify_schemas = FALSE)
-  expected <- as_tibble(sub_df1) %>%
-    union_all(sub_df2 %>% as_tibble() %>% select(x))
+  expected <- union_all_common(
+    as_tibble(sub_df1),
+    as_tibble(sub_df2) %>% select(x)
+  )
   actual <- ds %>%
     collect() %>%
     arrange(x)
diff --git a/r/tests/testthat/test-dplyr-distinct.R 
b/r/tests/testthat/test-dplyr-distinct.R
index b598d05890..09a8d5f8f5 100644
--- a/r/tests/testthat/test-dplyr-distinct.R
+++ b/r/tests/testthat/test-dplyr-distinct.R
@@ -25,6 +25,9 @@ test_that("distinct()", {
     .input %>%
       distinct(some_grouping, lgl) %>%
       collect() %>%
+      # GH-14947: column output order changed in dplyr 1.1.0, so we need
+      # to make the column order explicit until dplyr 1.1.0 is on CRAN
+      select(some_grouping, lgl) %>%
       arrange(some_grouping, lgl),
     tbl
   )
@@ -56,6 +59,9 @@ test_that("distinct() can retain groups", {
       group_by(some_grouping, int) %>%
       distinct(lgl) %>%
       collect() %>%
+      # GH-14947: column output order changed in dplyr 1.1.0, so we need
+      # to make the column order explicit until dplyr 1.1.0 is on CRAN
+      select(some_grouping, int, lgl) %>%
       arrange(lgl, int),
     tbl
   )
@@ -66,6 +72,9 @@ test_that("distinct() can retain groups", {
       group_by(y = some_grouping, int) %>%
       distinct(x = lgl) %>%
       collect() %>%
+      # GH-14947: column output order changed in dplyr 1.1.0, so we need
+      # to make the column order explicit until dplyr 1.1.0 is on CRAN
+      select(y, int, x) %>%
       arrange(int),
     tbl
   )
@@ -85,6 +94,9 @@ test_that("distinct() can contain expressions", {
       group_by(lgl, int) %>%
       distinct(x = some_grouping + 1) %>%
       collect() %>%
+      # GH-14947: column output order changed in dplyr 1.1.0, so we need
+      # to make the column order explicit until dplyr 1.1.0 is on CRAN
+      select(lgl, int, x) %>%
       arrange(int),
     tbl
   )
diff --git a/r/tests/testthat/test-dplyr-filter.R 
b/r/tests/testthat/test-dplyr-filter.R
index 281ae2abf0..24754afcf8 100644
--- a/r/tests/testthat/test-dplyr-filter.R
+++ b/r/tests/testthat/test-dplyr-filter.R
@@ -217,30 +217,12 @@ test_that("filter() with between()", {
       filter(dbl >= int, dbl <= dbl2)
   )
 
-  compare_dplyr_binding(
-    .input %>%
-      filter(between(dbl, 1, "2")) %>%
-      collect(),
-    tbl
-  )
-
   compare_dplyr_binding(
     .input %>%
       filter(between(dbl, 1, NA)) %>%
       collect(),
     tbl
   )
-
-  expect_warning(
-    compare_dplyr_binding(
-      .input %>%
-        filter(between(chr, 1, 2)) %>%
-        collect(),
-      tbl
-    ),
-    # the dplyr version warns:
-    "NAs introduced by coercion"
-  )
 })
 
 test_that("filter() with string ops", {
diff --git a/r/tests/testthat/test-dplyr-group-by.R 
b/r/tests/testthat/test-dplyr-group-by.R
index f17a013916..3c5d174b0c 100644
--- a/r/tests/testthat/test-dplyr-group-by.R
+++ b/r/tests/testthat/test-dplyr-group-by.R
@@ -299,7 +299,7 @@ test_that("Can use across() within group_by()", {
   test_groups <- c("dbl", "int", "chr")
   compare_dplyr_binding(
     .input %>%
-      group_by(across()) %>%
+      group_by(across(everything())) %>%
       collect(),
     tbl
   )
diff --git a/r/tests/testthat/test-dplyr-join.R 
b/r/tests/testthat/test-dplyr-join.R
index f0d3c8605d..cae741ae27 100644
--- a/r/tests/testthat/test-dplyr-join.R
+++ b/r/tests/testthat/test-dplyr-join.R
@@ -26,15 +26,14 @@ to_join <- tibble::tibble(
   another_column = TRUE
 )
 
-test_that("left_join", {
-  expect_message(
-    compare_dplyr_binding(
-      .input %>%
-        left_join(to_join) %>%
-        collect(),
-      left
-    ),
-    'Joining, by = "some_grouping"'
+test_that("left_join with automatic grouping", {
+  expect_identical(
+    as_record_batch(left) %>%
+      left_join(to_join) %>%
+      collect(),
+    left %>%
+      left_join(to_join, by = "some_grouping") %>%
+      collect()
   )
 })
 
@@ -174,14 +173,12 @@ test_that("full_join", {
 })
 
 test_that("semi_join", {
-  for (keep in c(TRUE, FALSE)) {
-    compare_dplyr_binding(
-      .input %>%
-        semi_join(to_join, by = "some_grouping", keep = !!keep) %>%
-        collect(),
-      left
-    )
-  }
+  compare_dplyr_binding(
+    .input %>%
+      semi_join(to_join, by = "some_grouping") %>%
+      collect(),
+    left
+  )
 })
 
 test_that("anti_join", {
diff --git a/r/tests/testthat/test-dplyr-query.R 
b/r/tests/testthat/test-dplyr-query.R
index 76a99d7c31..ee11cd6678 100644
--- a/r/tests/testthat/test-dplyr-query.R
+++ b/r/tests/testthat/test-dplyr-query.R
@@ -662,19 +662,9 @@ test_that("Scalars in expressions match the type of the 
field, if possible", {
     0
   )
 
-  # int == string, this works in dplyr and here too
-  expect_output(
-    tab %>%
-      filter(int == "5") %>%
-      show_exec_plan(),
-    "int == 5",
-    fixed = TRUE
-  )
-  expect_equal(
-    tab %>%
-      filter(int == "5") %>%
-      nrow(),
-    1
+  # int == string, errors starting in dplyr 1.1.0
+  expect_snapshot_warning(
+    tab %>% filter(int == "5")
   )
 
   # Strings automatically parsed to date/timestamp
diff --git a/r/tests/testthat/test-dplyr-slice.R 
b/r/tests/testthat/test-dplyr-slice.R
index 0db31c732f..9cef51d4f7 100644
--- a/r/tests/testthat/test-dplyr-slice.R
+++ b/r/tests/testthat/test-dplyr-slice.R
@@ -145,11 +145,11 @@ test_that("slice_* not supported with groups", {
     "Slicing grouped data not supported in Arrow"
   )
   expect_error(
-    slice_min(grouped, n = 5),
+    slice_min(grouped, int, n = 5),
     "Slicing grouped data not supported in Arrow"
   )
   expect_error(
-    slice_max(grouped, n = 5),
+    slice_max(grouped, int, n = 5),
     "Slicing grouped data not supported in Arrow"
   )
   expect_error(
diff --git a/r/tests/testthat/test-dplyr-summarize.R 
b/r/tests/testthat/test-dplyr-summarize.R
index 22356c68df..e54e57c836 100644
--- a/r/tests/testthat/test-dplyr-summarize.R
+++ b/r/tests/testthat/test-dplyr-summarize.R
@@ -298,20 +298,21 @@ test_that("n_distinct() on dataset", {
 })
 
 test_that("Functions that take ... but we only accept a single arg", {
-  compare_dplyr_binding(
-    .input %>%
-      summarize(distinct = n_distinct()) %>%
-      collect(),
-    tbl,
-    warning = "0 arguments"
+  # With zero arguments, n_distinct() will error in dplyr 1.1.0 too,
+  # so use a Dataset to avoid the "pulling data into R" step that would
+  # trigger a dplyr error
+  skip_if_not_available("dataset")
+  expect_snapshot(
+    InMemoryDataset$create(tbl) %>%
+      summarize(distinct = n_distinct()),
+    error = TRUE
   )
-  compare_dplyr_binding(
-    .input %>%
-      summarize(distinct = n_distinct(int, lgl)) %>%
-      collect(),
-    tbl,
-    warning = "Multiple arguments"
+
+  expect_snapshot_warning(
+    as_record_batch(tbl) %>%
+      summarize(distinct = n_distinct(int, lgl))
   )
+
   # Now that we've demonstrated that the whole machinery works, let's test
   # the agg_funcs directly
   expect_error(call_binding_agg("n_distinct"), "n_distinct() with 0 
arguments", fixed = TRUE)

Reply via email to