boshek commented on code in PR #13641:
URL: https://github.com/apache/arrow/pull/13641#discussion_r927973046
##########
r/R/dplyr.R:
##########
@@ -184,6 +184,29 @@ dim.arrow_dplyr_query <- function(x) {
c(rows, cols)
}
+#' @export
+unique.arrow_dplyr_query <- function(x, incomparables = FALSE, fromLast =
FALSE, ...) {
+
+ if (incomparables == TRUE) {
+ arrow_not_supported("`unique()` with `incomparables = TRUE`")
+ }
+
+ if (fromLast == TRUE) {
+ arrow_not_supported("`unique()` with `fromLast = TRUE`")
+ }
+
+ x <- dplyr::distinct(x)
+ dplyr::collect(x)
Review Comment:
It comes from
[this comment](https://issues.apache.org/jira/browse/ARROW-12693?focusedCommentId=17568169&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17568169).
I think it is a bit of a grey area. My thinking was that base functions don't fall
into the lazy dbplyr paradigm, but then I remembered `head`. dbplyr does not have
a `unique` method, so I don't think there is a precedent:
``` r
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
arrow_iris <- arrow_table(iris)
duckdb_iris <- to_duckdb(arrow_iris)
## head
head(arrow_iris)
#> Table
#> 6 rows x 5 columns
#> $Sepal.Length <double>
#> $Sepal.Width <double>
#> $Petal.Length <double>
#> $Petal.Width <double>
#> $Species <dictionary<values=string, indices=int8>>
#>
#> See $metadata for additional Schema metadata
head(duckdb_iris)
#> # Source: SQL [6 x 5]
#> # Database: DuckDB 0.3.5-dev1410 [root@Darwin 21.6.0:R 4.2.1/:memory:]
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
## distinct
distinct(arrow_iris)
#> Table (query)
#> Sepal.Length: double
#> Sepal.Width: double
#> Petal.Length: double
#> Petal.Width: double
#> Species: dictionary<values=string, indices=int8>
#>
#> See $.data for the source Arrow object
distinct(duckdb_iris)
#> # Source: SQL [?? x 5]
#> # Database: DuckDB 0.3.5-dev1410 [root@Darwin 21.6.0:R 4.2.1/:memory:]
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> 7 4.6 3.4 1.4 0.3 setosa
#> 8 5 3.4 1.5 0.2 setosa
#> 9 4.4 2.9 1.4 0.2 setosa
#> 10 4.9 3.1 1.5 0.1 setosa
#> # … with more rows
#> # ℹ Use `print(n = ...)` to see more rows
## unique
unique(duckdb_iris)
#> [[1]]
#> src: DuckDB 0.3.5-dev1410 [root@Darwin 21.6.0:R 4.2.1/:memory:]
#> tbls:
#>
#> [[2]]
#> From: arrow_001
#> <Table: arrow_001>
```
I don't think users will have an expectation here so we are probably free to
decide.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]