Maarten Demeyer created ARROW-9606:
--------------------------------------
Summary: %in% expressions don't work with >1 partition levels
Key: ARROW-9606
URL: https://issues.apache.org/jira/browse/ARROW-9606
Project: Apache Arrow
Issue Type: Bug
Components: R
Affects Versions: 1.0.0
Environment: sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19041)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United Kingdom.1252
#> [2] LC_CTYPE=English_United Kingdom.1252
#> [3] LC_MONETARY=English_United Kingdom.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United Kingdom.1252
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] dplyr_1.0.0 arrow_1.0.0.9000
Reporter: Maarten Demeyer
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
## Write files
## Lay out a two-level directory tree, <pqdir>/<foo>/<faa>/file.parquet, with
## each level taking the values "a" and "b" (four Parquet files in total).
## Every file holds a single column `col` of five consecutive letters.
pqdir <- file.path(tempdir(), paste(sample(letters, 6), collapse = ""))
for (foo in 0:1) {
  for (faa in 0:1) {
    fdir <- file.path(pqdir, letters[foo + 1], letters[faa + 1])
    dir.create(fdir, recursive = TRUE)
    # Position in `letters` of the first value for this (foo, faa) pair.
    first <- foo * 5 + faa + 1
    write_parquet(
      data.frame(col = letters[first:(first + 4)]),
      file.path(fdir, "file.parquet")
    )
  }
}
## What doesn't work: using %in% with both partitions defined
## The dataset is opened with a partitioning schema naming both directory
## levels (foo, then faa). Filtering on the first level with %in% should
## return the 10 rows written under <pqdir>/a/, but 0 rows come back.
ds <- open_dataset(pqdir,
partitioning = schema(foo = string(), faa = string()))
collect(filter(ds, foo %in% "a"))
#> # A tibble: 0 x 3
#> # ... with 3 variables: col <chr>, foo <chr>, faa <chr>
## == does work
## Control case: same `ds` object (two partition levels), with only the
## comparison operator changed from %in% to ==. All 10 rows with foo == "a"
## are returned, covering both faa values.
collect(filter(ds, foo == "a"))
#> # A tibble: 10 x 3
#> col foo faa
#> <chr> <chr> <chr>
#> 1 a a a
#> 2 b a a
#> 3 c a a
#> 4 d a a
#> 5 e a a
#> 6 b a b
#> 7 c a b
#> 8 d a b
#> 9 e a b
#> 10 f a b
## Declaring only one partition does work
## Control case: `ds` is re-opened over the same directory, but the
## partitioning schema names only the first level (foo). The same %in% filter
## now returns all 10 rows; the result has no faa column.
ds <- open_dataset(pqdir, partitioning = schema(foo = string()))
collect(filter(ds, foo %in% "a"))
#> # A tibble: 10 x 2
#> col foo
#> <chr> <chr>
#> 1 a a
#> 2 b a
#> 3 c a
#> 4 d a
#> 5 e a
#> 6 b a
#> 7 c a
#> 8 d a
#> 9 e a
#> 10 f a
## The lower-level API has the same problem
## Re-opens the dataset with both partition levels, then builds the filter
## directly as an Expression (field `foo` in the one-element Array "a") and
## passes it to a Scanner, without going through the dplyr verbs. The
## resulting Table still has 0 rows.
ds <- open_dataset(pqdir,
partitioning = schema(foo = string(), faa = string()))
flt <- Expression$in_(Expression$field_ref("foo"), Array$create("a"))
sc <- Scanner$create(ds, filter = flt)
sc$ToTable()
#> Table
#> 0 rows x 3 columns
#> $col <string>
#> $foo <string>
#> $faa <string>
--
This message was sent by Atlassian Jira
(v8.3.4#803005)