Maarten Demeyer created ARROW-9606:
--------------------------------------

             Summary: %in% expressions don't work with >1 partition levels
                 Key: ARROW-9606
                 URL: https://issues.apache.org/jira/browse/ARROW-9606
             Project: Apache Arrow
          Issue Type: Bug
          Components: R
    Affects Versions: 1.0.0
         Environment: sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19041)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United Kingdom.1252 
#> [2] LC_CTYPE=English_United Kingdom.1252   
#> [3] LC_MONETARY=English_United Kingdom.1252
#> [4] LC_NUMERIC=C                           
#> [5] LC_TIME=English_United Kingdom.1252    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] dplyr_1.0.0      arrow_1.0.0.9000
            Reporter: Maarten Demeyer


library(arrow)
#> 
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#> 
#>     timestamp
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

## Write files
## Build a two-level directory tree under a randomly named folder in tempdir():
##   <pqdir>/<a|b>/<a|b>/file.parquet
pqdir <- file.path(tempdir(), paste(sample(letters, 6), collapse = ""))

for (foo in 0:1) {
  for (faa in 0:1) {
    ## The loop indices 0/1 select the directory names "a"/"b" at each level
    fdir <- file.path(pqdir, letters[foo + 1], letters[faa + 1])
    dir.create(fdir, recursive = TRUE)
    ## Five consecutive letter positions, offset by both loop indices,
    ## so each file holds a different run of letters in its `col` column
    rng <- (foo * 5 + faa + 1):(foo * 5 + faa + 5)
    write_parquet(data.frame(col = letters[rng]),
                         file.path(fdir, "file.parquet"))
  }
}

## What doesn't work: using %in% with both partitions defined
## Both directory levels are declared as string partition fields (foo, faa)
ds <- open_dataset(pqdir,
                   partitioning = schema(foo = string(), faa = string()))

## Returns 0 rows, although the == filter further down returns 10 rows
## for the same value "a"
collect(filter(ds, foo %in% "a"))
#> # A tibble: 0 x 3
#> # ... with 3 variables: col <chr>, foo <chr>, faa <chr>

## == does work
collect(filter(ds, foo == "a"))
#> # A tibble: 10 x 3
#>    col   foo   faa  
#>    <chr> <chr> <chr>
#>  1 a     a     a    
#>  2 b     a     a    
#>  3 c     a     a    
#>  4 d     a     a    
#>  5 e     a     a    
#>  6 b     a     b    
#>  7 c     a     b    
#>  8 d     a     b    
#>  9 e     a     b    
#> 10 f     a     b

## Declaring only one partition does work
## Same files and same %in% filter; only `foo` is declared as a partition
## field, and the result has no `faa` column
ds <- open_dataset(pqdir, partitioning = schema(foo = string()))
collect(filter(ds, foo %in% "a"))
#> # A tibble: 10 x 2
#>    col   foo  
#>    <chr> <chr>
#>  1 a     a    
#>  2 b     a    
#>  3 c     a    
#>  4 d     a    
#>  5 e     a    
#>  6 b     a    
#>  7 c     a    
#>  8 d     a    
#>  9 e     a    
#> 10 f     a

## The lower-level API has the same problem
## Re-open the dataset with both partition fields declared
ds <- open_dataset(pqdir,
                   partitioning = schema(foo = string(), faa = string()))

## Build the membership filter on `foo` directly as an Expression,
## without going through dplyr::filter()
flt <- Expression$in_(Expression$field_ref("foo"), Array$create("a"))

## Scan the dataset with that filter and materialise the result as a Table
sc <- Scanner$create(ds, filter = flt)
sc$ToTable()
#> Table
#> 0 rows x 3 columns
#> $col <string>
#> $foo <string>
#> $faa <string>



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to