Maarten Demeyer created ARROW-9606:
--------------------------------------

Summary: %in% expressions don't work with >1 partition levels
Key: ARROW-9606
URL: https://issues.apache.org/jira/browse/ARROW-9606
Project: Apache Arrow
Issue Type: Bug
Components: R
Affects Versions: 1.0.0
Environment: sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19041)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United Kingdom.1252
#> [2] LC_CTYPE=English_United Kingdom.1252
#> [3] LC_MONETARY=English_United Kingdom.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United Kingdom.1252
#>
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base
#>
#> other attached packages:
#> [1] dplyr_1.0.0      arrow_1.0.0.9000
Reporter: Maarten Demeyer

library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#>     timestamp
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 'package:base':
#>
#>     intersect, setdiff, setequal, union

## Write files
pqdir <- file.path(tempdir(), paste(sample(letters, 6), collapse = ""))
for (foo in 0:1) {
  for (faa in 0:1) {
    fdir <- file.path(pqdir, letters[foo + 1], letters[faa + 1])
    dir.create(fdir, recursive = TRUE)
    rng <- (foo * 5 + faa + 1):(foo * 5 + faa + 5)
    write_parquet(data.frame(col = letters[rng]),
                  file.path(fdir, "file.parquet"))
  }
}

## What doesn't work: using %in% with both partitions defined
ds <- open_dataset(pqdir, partitioning = schema(foo = string(), faa = string()))
collect(filter(ds, foo %in% "a"))
#> # A tibble: 0 x 3
#> # ... with 3 variables: col <chr>, foo <chr>, faa <chr>

## == does work
collect(filter(ds, foo == "a"))
#> # A tibble: 10 x 3
#>    col   foo   faa
#>    <chr> <chr> <chr>
#>  1 a     a     a
#>  2 b     a     a
#>  3 c     a     a
#>  4 d     a     a
#>  5 e     a     a
#>  6 b     a     b
#>  7 c     a     b
#>  8 d     a     b
#>  9 e     a     b
#> 10 f     a     b

## Declaring only one partition does work
ds <- open_dataset(pqdir, partitioning = schema(foo = string()))
collect(filter(ds, foo %in% "a"))
#> # A tibble: 10 x 2
#>    col   foo
#>    <chr> <chr>
#>  1 a     a
#>  2 b     a
#>  3 c     a
#>  4 d     a
#>  5 e     a
#>  6 b     a
#>  7 c     a
#>  8 d     a
#>  9 e     a
#> 10 f     a

## The lower-level API has the same problem
ds <- open_dataset(pqdir, partitioning = schema(foo = string(), faa = string()))
flt <- Expression$in_(Expression$field_ref("foo"), Array$create("a"))
sc <- Scanner$create(ds, filter = flt)
sc$ToTable()
#> Table
#> 0 rows x 3 columns
#> $col <string>
#> $foo <string>
#> $faa <string>

--
This message was sent by Atlassian Jira
(v8.3.4#803005)