[ 
https://issues.apache.org/jira/browse/ARROW-18102?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17622235#comment-17622235
 ] 

Adam Black edited comment on ARROW-18102 at 10/21/22 12:41 PM:
---------------------------------------------------------------

Here is a reprex using SQLite in R. As a user, I would expect the behavior to be 
similar with FileSystemDataset objects.

 

{{``` r}}
{{library(DBI)}}
{{library(dplyr)}}

{{con <- dbConnect(RSQLite::SQLite(), ":memory:")}}

{{zero_row_table <- cars %>% filter(speed < 0)}}
{{nrow(zero_row_table)}}
{{#> [1] 0}}

{{dbWriteTable(con, "zero_row_table", zero_row_table)}}

{{dbGetQuery(con, "select count(1) as n from zero_row_table")}}
{{#>   n}}
{{#> 1 0}}

{{tbl(con, "zero_row_table") %>% }}
{{  count()}}
{{#> # Source:   SQL [1 x 1]}}
{{#> # Database: sqlite 3.39.4 [:memory:]}}
{{#>       n}}
{{#>   <int>}}
{{#> 1     0}}

{{tbl(con, "zero_row_table") %>% }}
{{  count() %>% }}
{{  show_query()}}
{{#> <SQL>}}
{{#> SELECT COUNT(*) AS `n`}}
{{#> FROM `zero_row_table`}}

{{tbl(con, "zero_row_table") %>% }}
{{  tally()}}
{{#> # Source:   SQL [1 x 1]}}
{{#> # Database: sqlite 3.39.4 [:memory:]}}
{{#>       n}}
{{#>   <int>}}
{{#> 1     0}}

{{tbl(con, "zero_row_table") %>% }}
{{  tally() %>% }}
{{  show_query()}}
{{#> <SQL>}}
{{#> SELECT COUNT(*) AS `n`}}
{{#> FROM `zero_row_table`}}

{{dbDisconnect(con)}}
{{```}}


was (Author: JIRAUSER289460):
Here a reprex using SQLite in R. As a user I would expect the behavior to be 
similar with FileSystemDataset objects.

 

{{``` r}}
{{library(DBI)}}
{{library(dplyr)}}

{{{}con <- dbConnect(RSQLite::SQLite(), ":memory:"){}}}{{{}zero_row_table <- 
cars %>% filter(speed < 0){}}}
{{nrow(zero_row_table)}}
{{{}#> [1] 0{}}}{{{}dbWriteTable(con, "zero_row_table", 
zero_row_table){}}}{{{}dbGetQuery(con, "select count(1) as n from 
zero_row_table"){}}}
{{#>   n}}
{{{}#> 1 0{}}}{{{}tbl(con, "zero_row_table") %>% {}}}
{{  count()}}
{{#> # Source:   SQL [1 x 1]}}
{{#> # Database: sqlite 3.39.4 [:memory:]}}
{{#>       n}}
{{#>   <int>}}
{{{}#> 1     0{}}}{{{}tbl(con, "zero_row_table") %>% {}}}
{{  count() %>% }}
{{  show_query()}}
{{#> <SQL>}}
{{#> SELECT COUNT(*) AS `n`}}
{{{}#> FROM `zero_row_table`{}}}{{{}tbl(con, "zero_row_table") %>% {}}}
{{  tally()}}
{{#> # Source:   SQL [1 x 1]}}
{{#> # Database: sqlite 3.39.4 [:memory:]}}
{{#>       n}}
{{#>   <int>}}
{{{}#> 1     0{}}}{{{}tbl(con, "zero_row_table") %>% {}}}
{{  tally() %>% }}
{{  show_query()}}
{{#> <SQL>}}
{{#> SELECT COUNT(*) AS `n`}}
{{{}#> FROM `zero_row_table`{}}}{{{}dbDisconnect(con){}}}
{{```}}

> [R] dplyr::count and dplyr::tally implementation return NA instead of 0
> -----------------------------------------------------------------------
>
>                 Key: ARROW-18102
>                 URL: https://issues.apache.org/jira/browse/ARROW-18102
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: R
>         Environment: Arrow R package 9.0.0 on Mac OS 12.6 with R 4.2.0
>            Reporter: Adam Black
>            Priority: Major
>
> I'm using dplyr with FileSystemDataset objects. The expected behavior is 
> similar to (or the same as) data frame behavior. When the FileSystemDataset has 
> zero rows, dplyr::count and dplyr::tally return NA instead of 0. I would 
> expect the result to be 0.
>  
> {code:r}
> library(arrow)
> #> 
> #> Attaching package: 'arrow'
> #> The following object is masked from 'package:utils':
> #> 
> #>     timestamp
> library(dplyr)
> #> 
> #> Attaching package: 'dplyr'
> #> The following objects are masked from 'package:stats':
> #> 
> #>     filter, lag
> #> The following objects are masked from 'package:base':
> #> 
> #>     intersect, setdiff, setequal, union
> path <- tempfile(fileext = ".feather")
> zero_row_dataset <- cars %>% filter(dist < 0)
> # expected behavior
> zero_row_dataset %>% 
>   count()
> #>   n
> #> 1 0
> zero_row_dataset %>% 
>   tally()
> #>   n
> #> 1 0
> nrow(zero_row_dataset)
> #> [1] 0
> # now test behavior with a FileSystemDataset
> write_feather(zero_row_dataset, path)
> ds <- open_dataset(path, format = "feather")
> ds
> #> FileSystemDataset with 1 Feather file
> #> speed: double
> #> dist: double
> #> 
> #> See $metadata for additional Schema metadata
> # actual behavior
> ds %>% 
>   count() %>% 
>   collect() # incorrect result
> #> # A tibble: 1 × 1
> #>       n
> #>   <int>
> #> 1    NA
> ds %>% 
>   tally() %>% 
>   collect() # incorrect result
> #> # A tibble: 1 × 1
> #>       n
> #>   <int>
> #> 1    NA
> nrow(ds) # works as expected
> #> [1] 0
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to