[jira] [Updated] (ARROW-9606) %in% expressions don't work with >1 partition levels

Maarten Demeyer (Jira) Fri, 31 Jul 2020 01:52:13 -0700


     [ 
https://issues.apache.org/jira/browse/ARROW-9606?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]


Maarten Demeyer updated ARROW-9606:
-----------------------------------
    Description: 
When filtering nested partitions using %in%, no rows are returned, both for 
Hive and non-Hive partitioning. == and other comparison operators do work, and 
the problem also goes away when only one partition level is declared in the 
schema.

This is not caused by the dplyr wrappers; the lower-level functions have the 
same problem.

{{library(arrow)
#> 
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#> 
#>     timestamp
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

## Write files
pqdir <- file.path(tempdir(), paste(sample(letters, 6), collapse = ""))

for (foo in 0:1) {
  for (faa in 0:1) {
    fdir <- file.path(pqdir, letters[foo + 1], letters[faa + 1])
    dir.create(fdir, recursive = TRUE)
    rng <- (foo * 5 + faa + 1):(foo * 5 + faa + 5)
    write_parquet(data.frame(col = letters[rng]),
                         file.path(fdir, "file.parquet"))
  }
}

## What doesn't work: using %in% with both partitions defined
ds <- open_dataset(pqdir,
                   partitioning = schema(foo = string(), faa = string()))

collect(filter(ds, foo %in% "a"))
#> # A tibble: 0 x 3
#> # ... with 3 variables: col <chr>, foo <chr>, faa <chr>

## == does work
collect(filter(ds, foo == "a"))
#> # A tibble: 10 x 3
#>    col   foo   faa  
#>    <chr> <chr> <chr>
#>  1 a     a     a    
#>  2 b     a     a    
#>  3 c     a     a    
#>  4 d     a     a    
#>  5 e     a     a    
#>  6 b     a     b    
#>  7 c     a     b    
#>  8 d     a     b    
#>  9 e     a     b    
#> 10 f     a     b

## Declaring only one partition does work
ds <- open_dataset(pqdir, partitioning = schema(foo = string()))
collect(filter(ds, foo %in% "a"))
#> # A tibble: 10 x 2
#>    col   foo  
#>    <chr> <chr>
#>  1 a     a    
#>  2 b     a    
#>  3 c     a    
#>  4 d     a    
#>  5 e     a    
#>  6 b     a    
#>  7 c     a    
#>  8 d     a    
#>  9 e     a    
#> 10 f     a

## The lower-level API has the same problem
ds <- open_dataset(pqdir,
                   partitioning = schema(foo = string(), faa = string()))

flt <- Expression$in_(Expression$field_ref("foo"), Array$create("a"))

sc <- Scanner$create(ds, filter = flt)
sc$ToTable()
#> Table
#> 0 rows x 3 columns
#> $col <string>
#> $foo <string>
#> $faa <string>}}

  was:
library(arrow)
#> 
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#> 
#>     timestamp
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

## Write files
pqdir <- file.path(tempdir(), paste(sample(letters, 6), collapse = ""))

for (foo in 0:1) {
  for (faa in 0:1) {
    fdir <- file.path(pqdir, letters[foo + 1], letters[faa + 1])
    dir.create(fdir, recursive = TRUE)
    rng <- (foo * 5 + faa + 1):(foo * 5 + faa + 5)
    write_parquet(data.frame(col = letters[rng]),
                         file.path(fdir, "file.parquet"))
  }
}

## What doesn't work: using %in% with both partitions defined
ds <- open_dataset(pqdir,
                   partitioning = schema(foo = string(), faa = string()))

collect(filter(ds, foo %in% "a"))
#> # A tibble: 0 x 3
#> # ... with 3 variables: col <chr>, foo <chr>, faa <chr>

## == does work
collect(filter(ds, foo == "a"))
#> # A tibble: 10 x 3
#>    col   foo   faa  
#>    <chr> <chr> <chr>
#>  1 a     a     a    
#>  2 b     a     a    
#>  3 c     a     a    
#>  4 d     a     a    
#>  5 e     a     a    
#>  6 b     a     b    
#>  7 c     a     b    
#>  8 d     a     b    
#>  9 e     a     b    
#> 10 f     a     b

## Declaring only one partition does work
ds <- open_dataset(pqdir, partitioning = schema(foo = string()))
collect(filter(ds, foo %in% "a"))
#> # A tibble: 10 x 2
#>    col   foo  
#>    <chr> <chr>
#>  1 a     a    
#>  2 b     a    
#>  3 c     a    
#>  4 d     a    
#>  5 e     a    
#>  6 b     a    
#>  7 c     a    
#>  8 d     a    
#>  9 e     a    
#> 10 f     a

## The lower-level API has the same problem
ds <- open_dataset(pqdir,
                   partitioning = schema(foo = string(), faa = string()))

flt <- Expression$in_(Expression$field_ref("foo"), Array$create("a"))

sc <- Scanner$create(ds, filter = flt)
sc$ToTable()
#> Table
#> 0 rows x 3 columns
#> $col <string>
#> $foo <string>
#> $faa <string>

    Environment: 
This is using the latest GitHub version on Windows, but I can also reproduce 
it with the CRAN version and on Linux.

sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19041)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United Kingdom.1252 
#> [2] LC_CTYPE=English_United Kingdom.1252   
#> [3] LC_MONETARY=English_United Kingdom.1252
#> [4] LC_NUMERIC=C                           
#> [5] LC_TIME=English_United Kingdom.1252    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] dplyr_1.0.0      arrow_1.0.0.9000

  was:
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 19041)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United Kingdom.1252 
#> [2] LC_CTYPE=English_United Kingdom.1252   
#> [3] LC_MONETARY=English_United Kingdom.1252
#> [4] LC_NUMERIC=C                           
#> [5] LC_TIME=English_United Kingdom.1252    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] dplyr_1.0.0      arrow_1.0.0.9000


> %in% expressions don't work with >1 partition levels
> ----------------------------------------------------
>
>                 Key: ARROW-9606
>                 URL: https://issues.apache.org/jira/browse/ARROW-9606
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: R
>    Affects Versions: 1.0.0
>         Environment: This is using the latest GitHub version on Windows, 
> but I can also reproduce it with the CRAN version and on Linux.
> sessionInfo()
> #> R version 4.0.2 (2020-06-22)
> #> Platform: x86_64-w64-mingw32/x64 (64-bit)
> #> Running under: Windows 10 x64 (build 19041)
> #> 
> #> Matrix products: default
> #> 
> #> locale:
> #> [1] LC_COLLATE=English_United Kingdom.1252 
> #> [2] LC_CTYPE=English_United Kingdom.1252   
> #> [3] LC_MONETARY=English_United Kingdom.1252
> #> [4] LC_NUMERIC=C                           
> #> [5] LC_TIME=English_United Kingdom.1252    
> #> 
> #> attached base packages:
> #> [1] stats     graphics  grDevices utils     datasets  methods   base     
> #> 
> #> other attached packages:
> #> [1] dplyr_1.0.0      arrow_1.0.0.9000
>            Reporter: Maarten Demeyer
>            Priority: Critical
>
> When filtering nested partitions using %in%, no rows are returned, both for 
> Hive and non-Hive partitioning. == and other comparison operators do work, 
> and the problem also goes away when only one partition level is declared in 
> the schema.
> This is not caused by the dplyr wrappers; the lower-level functions have the 
> same problem.
> {{library(arrow)
> #> 
> #> Attaching package: 'arrow'
> #> The following object is masked from 'package:utils':
> #> 
> #>     timestamp
> library(dplyr)
> #> 
> #> Attaching package: 'dplyr'
> #> The following objects are masked from 'package:stats':
> #> 
> #>     filter, lag
> #> The following objects are masked from 'package:base':
> #> 
> #>     intersect, setdiff, setequal, union
> ## Write files
> pqdir <- file.path(tempdir(), paste(sample(letters, 6), collapse = ""))
> for (foo in 0:1) {
>   for (faa in 0:1) {
>     fdir <- file.path(pqdir, letters[foo + 1], letters[faa + 1])
>     dir.create(fdir, recursive = TRUE)
>     rng <- (foo * 5 + faa + 1):(foo * 5 + faa + 5)
>     write_parquet(data.frame(col = letters[rng]),
>                          file.path(fdir, "file.parquet"))
>   }
> }
> ## What doesn't work: using %in% with both partitions defined
> ds <- open_dataset(pqdir,
>                    partitioning = schema(foo = string(), faa = string()))
> collect(filter(ds, foo %in% "a"))
> #> # A tibble: 0 x 3
> #> # ... with 3 variables: col <chr>, foo <chr>, faa <chr>
> ## == does work
> collect(filter(ds, foo == "a"))
> #> # A tibble: 10 x 3
> #>    col   foo   faa  
> #>    <chr> <chr> <chr>
> #>  1 a     a     a    
> #>  2 b     a     a    
> #>  3 c     a     a    
> #>  4 d     a     a    
> #>  5 e     a     a    
> #>  6 b     a     b    
> #>  7 c     a     b    
> #>  8 d     a     b    
> #>  9 e     a     b    
> #> 10 f     a     b
> ## Declaring only one partition does work
> ds <- open_dataset(pqdir, partitioning = schema(foo = string()))
> collect(filter(ds, foo %in% "a"))
> #> # A tibble: 10 x 2
> #>    col   foo  
> #>    <chr> <chr>
> #>  1 a     a    
> #>  2 b     a    
> #>  3 c     a    
> #>  4 d     a    
> #>  5 e     a    
> #>  6 b     a    
> #>  7 c     a    
> #>  8 d     a    
> #>  9 e     a    
> #> 10 f     a
> ## The lower-level API has the same problem
> ds <- open_dataset(pqdir,
>                    partitioning = schema(foo = string(), faa = string()))
> flt <- Expression$in_(Expression$field_ref("foo"), Array$create("a"))
> sc <- Scanner$create(ds, filter = flt)
> sc$ToTable()
> #> Table
> #> 0 rows x 3 columns
> #> $col <string>
> #> $foo <string>
> #> $faa <string>}}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

[jira] [Updated] (ARROW-9606) %in% expressions don't work with >1 partition levels

Reply via email to