This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 29cc263068 ARROW-17085: [R] group_vars() should not return NULL
(#13621)
29cc263068 is described below
commit 29cc263068b983e690879d4d768025439a0fdd47
Author: eitsupi <[email protected]>
AuthorDate: Sat Jul 16 01:06:57 2022 +0900
ARROW-17085: [R] group_vars() should not return NULL (#13621)
If an ungrouped data.frame or an `arrow_dplyr_query` is given to
`dplyr::group_vars()`, `character()` returns.
But for an ungrouped Table, `NULL` is returned.
```r
mtcars |> dplyr::group_vars()
#> character(0)
mtcars |> arrow:::as_adq() |> dplyr::group_vars()
#> character(0)
mtcars |> arrow::arrow_table() |> dplyr::group_vars()
#> NULL
```
Therefore, functions that expect `group_vars` to return character, such as
the following, will fail.
```r
mtcars |> arrow::arrow_table() |> dtplyr::lazy_dt()
#> Error in new_step(parent, vars = names(parent), groups = groups, locals
= list(), : is.character(groups) is not TRUE
```
This PR modifies `dplyr::group_vars()` and `dplyr::groups()` for Arrow
objects to work the same as for data.frame.
(Note that `arrow_dplyr_query` already works the same way as data.frame.)
Lead-authored-by: SHIMA Tatsuya <[email protected]>
Co-authored-by: eitsupi <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/R/dplyr-group-by.R | 8 ++++----
r/R/dplyr.R | 2 +-
r/tests/testthat/test-RecordBatch.R | 10 ++++++++--
r/tests/testthat/test-Table.R | 8 +++++++-
r/tests/testthat/test-metadata.R | 2 +-
5 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
index 250dbedb18..c650799e8d 100644
--- a/r/R/dplyr-group-by.R
+++ b/r/R/dplyr-group-by.R
@@ -58,13 +58,13 @@ group_by.arrow_dplyr_query <- function(.data,
group_by.Dataset <- group_by.ArrowTabular <- group_by.RecordBatchReader <-
group_by.arrow_dplyr_query
groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x))
-groups.Dataset <- groups.ArrowTabular <- groups.RecordBatchReader <-
function(x) NULL
+groups.Dataset <- groups.ArrowTabular <- groups.RecordBatchReader <-
groups.arrow_dplyr_query
group_vars.arrow_dplyr_query <- function(x) x$group_by_vars
-group_vars.Dataset <- function(x) NULL
-group_vars.RecordBatchReader <- function(x) NULL
+group_vars.Dataset <- function(x) character()
+group_vars.RecordBatchReader <- function(x) character()
group_vars.ArrowTabular <- function(x) {
- x$metadata$r$attributes$.group_vars
+ x$metadata$r$attributes$.group_vars %||% character()
}
# the logical literal in the two functions below controls the default value of
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index b048d98018..1296e60384 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -42,7 +42,7 @@ arrow_dplyr_query <- function(.data) {
gv <- tryCatch(
# If dplyr is not available, or if the input doesn't have a group_vars
# method, assume no group vars
- dplyr::group_vars(.data) %||% character(),
+ dplyr::group_vars(.data),
error = function(e) character()
)
diff --git a/r/tests/testthat/test-RecordBatch.R
b/r/tests/testthat/test-RecordBatch.R
index e7602d9f74..6b79325934 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -654,7 +654,7 @@ test_that("Handling string data with embedded nuls", {
})
})
-test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch
creation", {
+test_that("ARROW-11769/ARROW-13860/ARROW-17085 - grouping preserved in record
batch creation", {
skip_if_not_available("dataset")
library(dplyr, warn.conflicts = FALSE)
@@ -670,6 +670,12 @@ test_that("ARROW-11769/ARROW-13860 - grouping preserved in
record batch creation
record_batch(),
"RecordBatch"
)
+ expect_identical(
+ tbl %>%
+ record_batch() %>%
+ group_vars(),
+ group_vars(tbl)
+ )
expect_identical(
tbl %>%
group_by(fct, fct2) %>%
@@ -683,7 +689,7 @@ test_that("ARROW-11769/ARROW-13860 - grouping preserved in
record batch creation
record_batch() %>%
ungroup() %>%
group_vars(),
- NULL
+ character()
)
expect_identical(
tbl %>%
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index 5edba2cd4a..bafd183108 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -592,7 +592,7 @@ test_that("cbind.Table handles record batches and tables", {
)
})
-test_that("ARROW-11769 - grouping preserved in table creation", {
+test_that("ARROW-11769/ARROW-17085 - grouping preserved in table creation", {
skip_if_not_available("dataset")
tbl <- tibble::tibble(
@@ -601,6 +601,12 @@ test_that("ARROW-11769 - grouping preserved in table
creation", {
fct2 = factor(rep(c("C", "D"), each = 5)),
)
+ expect_identical(
+ tbl %>%
+ Table$create() %>%
+ dplyr::group_vars(),
+ dplyr::group_vars(tbl)
+ )
expect_identical(
tbl %>%
dplyr::group_by(fct, fct2) %>%
diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R
index 69475ccd39..21b7ebe11a 100644
--- a/r/tests/testthat/test-metadata.R
+++ b/r/tests/testthat/test-metadata.R
@@ -382,7 +382,7 @@ test_that("grouped_df metadata is recorded (efficiently)", {
expect_s3_class(grouped, "grouped_df")
grouped_tab <- Table$create(grouped)
expect_r6_class(grouped_tab, "Table")
- expect_equal(grouped_tab$r_metadata$attributes$.group_vars, "a")
+ expect_equal(grouped_tab$metadata$r$attributes$.group_vars, "a")
})
test_that("grouped_df non-arrow metadata is preserved", {