This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 95aec82bd6 ARROW-12693: [R] add unique() methods for ArrowTabular, datasets (#13641)
95aec82bd6 is described below

commit 95aec82bd6080d8fcfedb4fa558d306e2a3dd7ec
Author: Sam Albers <[email protected]>
AuthorDate: Wed Jul 27 07:01:23 2022 -0400

    ARROW-12693: [R] add unique() methods for ArrowTabular, datasets (#13641)
    
    This implements a pretty thin version of `unique` and some tests:
    
    ``` r
    library(arrow, warn.conflicts = FALSE)
    library(dplyr, warn.conflicts = FALSE)
    
    ir_ds <- arrow_table(iris) %>%
      select(Species)
    unique(ir_ds)
    #> # A tibble: 3 × 1
    #>   Species
    #>   <fct>
    #> 1 setosa
    #> 2 versicolor
    #> 3 virginica
    ```
    
    Authored-by: Sam Albers <[email protected]>
    Signed-off-by: Dewey Dunnington <[email protected]>
---
 r/NAMESPACE                        |  4 ++++
 r/R/dplyr.R                        | 22 ++++++++++++++++++++++
 r/tests/testthat/_snaps/dataset.md |  4 ++++
 r/tests/testthat/test-dataset.R    | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 17f404caa1..c4c18ba16d 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -151,6 +151,10 @@ S3method(tail,RecordBatchReader)
 S3method(tail,Scanner)
 S3method(tail,arrow_dplyr_query)
 S3method(unique,ArrowDatum)
+S3method(unique,ArrowTabular)
+S3method(unique,Dataset)
+S3method(unique,RecordBatchReader)
+S3method(unique,arrow_dplyr_query)
 S3method(vec_ptype_abbr,arrow_fixed_size_binary)
 S3method(vec_ptype_abbr,arrow_fixed_size_list)
 S3method(vec_ptype_abbr,arrow_large_list)
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index c1fb4fef2b..dd6340c4f5 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -184,6 +184,28 @@ dim.arrow_dplyr_query <- function(x) {
   c(rows, cols)
 }
 
+#' @export
+unique.arrow_dplyr_query <- function(x, incomparables = FALSE, fromLast = FALSE, ...) {
+
+  if (isTRUE(incomparables)) {
+    arrow_not_supported("`unique()` with `incomparables = TRUE`")
+  }
+
+  if (fromLast == TRUE) {
+    arrow_not_supported("`unique()` with `fromLast = TRUE`")
+  }
+
+  dplyr::distinct(x)
+}
+
+#' @export
+unique.Dataset <- unique.arrow_dplyr_query
+#' @export
+unique.ArrowTabular <- unique.arrow_dplyr_query
+#' @export
+unique.RecordBatchReader <- unique.arrow_dplyr_query
+
+
 #' @export
 as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) {
   collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
diff --git a/r/tests/testthat/_snaps/dataset.md b/r/tests/testthat/_snaps/dataset.md
new file mode 100644
index 0000000000..8831c31e3b
--- /dev/null
+++ b/r/tests/testthat/_snaps/dataset.md
@@ -0,0 +1,4 @@
+# unique()
+
+    `unique()` with `incomparables = TRUE` not supported in Arrow
+
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index 3bcdd8bcde..d43bb492d0 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -799,6 +799,43 @@ test_that("head/tail", {
   expect_error(tail(ds, -1)) # Not yet implemented
 })
 
+
+test_that("unique()", {
+  ds <- open_dataset(dataset_dir)
+  in_r_mem <- rbind(df1, df2)
+  at <- arrow_table(in_r_mem)
+  rbr <- as_record_batch_reader(in_r_mem)
+
+  expect_s3_class(unique(ds), "arrow_dplyr_query")
+  expect_s3_class(unique(at), "arrow_dplyr_query")
+  expect_s3_class(unique(rbr), "arrow_dplyr_query")
+
+  # on a arrow_dplyr_query
+  adq_eg <- ds %>%
+    select(fct) %>%
+    unique()
+  expect_s3_class(adq_eg, "arrow_dplyr_query")
+
+  # order not set by distinct so some sorting required
+  expect_equal(sort(collect(unique(ds))$int), sort(unique(in_r_mem)$int))
+
+  # on a arrow table
+  expect_equal(
+    at %>%
+      unique() %>%
+      collect(),
+    unique(in_r_mem)
+  )
+  expect_equal(
+    rbr %>%
+      unique() %>%
+      collect(),
+    unique(in_r_mem)
+  )
+  expect_snapshot_error(unique(arrow_table(in_r_mem), incomparables = TRUE))
+})
+
+
 test_that("Dataset [ (take by index)", {
   ds <- open_dataset(dataset_dir)
   # Taking only from one file

Reply via email to