This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 95aec82bd6 ARROW-12693: [R] add unique() methods for ArrowTabular,
datasets (#13641)
95aec82bd6 is described below
commit 95aec82bd6080d8fcfedb4fa558d306e2a3dd7ec
Author: Sam Albers <[email protected]>
AuthorDate: Wed Jul 27 07:01:23 2022 -0400
ARROW-12693: [R] add unique() methods for ArrowTabular, datasets (#13641)
This implements a pretty thin version of `unique` and some tests:
``` r
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
ir_ds <- arrow_table(iris) %>%
select(Species)
unique(ir_ds)
#> # A tibble: 3 × 1
#> Species
#> <fct>
#> 1 setosa
#> 2 versicolor
#> 3 virginica
```
Authored-by: Sam Albers <[email protected]>
Signed-off-by: Dewey Dunnington <[email protected]>
---
r/NAMESPACE | 4 ++++
r/R/dplyr.R | 22 ++++++++++++++++++++++
r/tests/testthat/_snaps/dataset.md | 4 ++++
r/tests/testthat/test-dataset.R | 37 +++++++++++++++++++++++++++++++++++++
4 files changed, 67 insertions(+)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 17f404caa1..c4c18ba16d 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -151,6 +151,10 @@ S3method(tail,RecordBatchReader)
S3method(tail,Scanner)
S3method(tail,arrow_dplyr_query)
S3method(unique,ArrowDatum)
+S3method(unique,ArrowTabular)
+S3method(unique,Dataset)
+S3method(unique,RecordBatchReader)
+S3method(unique,arrow_dplyr_query)
S3method(vec_ptype_abbr,arrow_fixed_size_binary)
S3method(vec_ptype_abbr,arrow_fixed_size_list)
S3method(vec_ptype_abbr,arrow_large_list)
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index c1fb4fef2b..dd6340c4f5 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -184,6 +184,28 @@ dim.arrow_dplyr_query <- function(x) {
c(rows, cols)
}
+#' @export
+unique.arrow_dplyr_query <- function(x, incomparables = FALSE, fromLast =
FALSE, ...) {
+
+ if (isTRUE(incomparables)) {
+ arrow_not_supported("`unique()` with `incomparables = TRUE`")
+ }
+
+ if (fromLast == TRUE) {
+ arrow_not_supported("`unique()` with `fromLast = TRUE`")
+ }
+
+ dplyr::distinct(x)
+}
+
+#' @export
+unique.Dataset <- unique.arrow_dplyr_query
+#' @export
+unique.ArrowTabular <- unique.arrow_dplyr_query
+#' @export
+unique.RecordBatchReader <- unique.arrow_dplyr_query
+
+
#' @export
as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional =
FALSE, ...) {
collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
diff --git a/r/tests/testthat/_snaps/dataset.md
b/r/tests/testthat/_snaps/dataset.md
new file mode 100644
index 0000000000..8831c31e3b
--- /dev/null
+++ b/r/tests/testthat/_snaps/dataset.md
@@ -0,0 +1,4 @@
+# unique()
+
+ `unique()` with `incomparables = TRUE` not supported in Arrow
+
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index 3bcdd8bcde..d43bb492d0 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -799,6 +799,43 @@ test_that("head/tail", {
expect_error(tail(ds, -1)) # Not yet implemented
})
+
+test_that("unique()", {
+ ds <- open_dataset(dataset_dir)
+ in_r_mem <- rbind(df1, df2)
+ at <- arrow_table(in_r_mem)
+ rbr <- as_record_batch_reader(in_r_mem)
+
+ expect_s3_class(unique(ds), "arrow_dplyr_query")
+ expect_s3_class(unique(at), "arrow_dplyr_query")
+ expect_s3_class(unique(rbr), "arrow_dplyr_query")
+
+ # on a arrow_dplyr_query
+ adq_eg <- ds %>%
+ select(fct) %>%
+ unique()
+ expect_s3_class(adq_eg, "arrow_dplyr_query")
+
+ # order not set by distinct so some sorting required
+ expect_equal(sort(collect(unique(ds))$int), sort(unique(in_r_mem)$int))
+
+ # on a arrow table
+ expect_equal(
+ at %>%
+ unique() %>%
+ collect(),
+ unique(in_r_mem)
+ )
+ expect_equal(
+ rbr %>%
+ unique() %>%
+ collect(),
+ unique(in_r_mem)
+ )
+ expect_snapshot_error(unique(arrow_table(in_r_mem), incomparables = TRUE))
+})
+
+
test_that("Dataset [ (take by index)", {
ds <- open_dataset(dataset_dir)
# Taking only from one file