paleolimbot commented on pull request #11730:
URL: https://github.com/apache/arrow/pull/11730#issuecomment-984602422


   Nothing new yet, just listing the various ways this can fail.
   
   First, intermittent success!
   
   <details>
   
   ``` r
   library(arrow, warn.conflicts = FALSE)
   library(dplyr, warn.conflicts = FALSE)
   
   example_data <- tibble::tibble(
     int = c(1:3, NA_integer_, 5:10),
     dbl = c(1:8, NA, 10) + .1,
     dbl2 = rep(5, 10),
     lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
     false = logical(10),
     chr = letters[c(1:5, NA, 7:10)],
     fct = factor(letters[c(1:4, NA, NA, 7:10)])
   )
   
   tf <- tempfile()
   new_ds <- rbind(
     cbind(example_data, part = 1),
     cbind(example_data, part = 2),
     cbind(example_data, part = 3),
     cbind(example_data, part = 4)
   ) %>%
     mutate(row_order = 1:n())
   
   write_dataset(new_ds, tf, partitioning = "part")
   
   ds <- open_dataset(tf)
   
   waldo::compare(
     ds %>%
       to_duckdb() %>%
       # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
       select(-fct) %>%
       to_arrow() %>%
       filter(int > 5 & part > 1) %>%
       collect() %>%
       arrange(row_order) %>%
       tibble::as_tibble(),
     ds %>%
       select(-fct) %>%
       filter(int > 5 & part > 1) %>%
       collect() %>%
       arrange(row_order) %>%
       tibble::as_tibble()
   )
   #> ✓ No differences
   ```
   
   <sup>Created on 2021-12-02 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup>
   
   </details>
   
   Second, filter mismatch:
   
   <details>
   
   ``` r
   library(arrow, warn.conflicts = FALSE)
   library(dplyr, warn.conflicts = FALSE)
   
   example_data <- tibble::tibble(
     int = c(1:3, NA_integer_, 5:10),
     dbl = c(1:8, NA, 10) + .1,
     dbl2 = rep(5, 10),
     lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE),
     false = logical(10),
     chr = letters[c(1:5, NA, 7:10)],
     fct = factor(letters[c(1:4, NA, NA, 7:10)])
   )
   
   tf <- tempfile()
   new_ds <- rbind(
     cbind(example_data, part = 1),
     cbind(example_data, part = 2),
     cbind(example_data, part = 3),
     cbind(example_data, part = 4)
   ) %>%
     mutate(row_order = 1:n())
   
   write_dataset(new_ds, tf, partitioning = "part")
   
   ds <- open_dataset(tf)
   
   waldo::compare(
     ds %>%
       to_duckdb() %>%
       # factors don't roundtrip https://github.com/duckdb/duckdb/issues/1879
       select(-fct) %>%
       to_arrow() %>%
       filter(int > 5 & part > 1) %>%
       collect() %>%
       arrange(row_order) %>%
       tibble::as_tibble(),
     ds %>%
       select(-fct) %>%
       filter(int > 5 & part > 1) %>%
       collect() %>%
       arrange(row_order) %>%
       tibble::as_tibble()
   )
   #> old vs new
   #>             int  dbl   lgl  chr row_order part
   #> - old[1, ]    7  7.1  TRUE    g         0    3
   #> + new[1, ]    6  6.1  TRUE <NA>        16    2
   #> - old[2, ]    8  8.1    NA    h         0    3
   #> + new[2, ]    7  7.1  TRUE    g        17    2
   #> - old[3, ]    9   NA    NA    i         0    3
   #> + new[3, ]    8  8.1    NA    h        18    2
   #> - old[4, ]   10 10.1 FALSE    j         0    3
   #> + new[4, ]    9   NA    NA    i        19    2
   #> - old[5, ]    6  6.1  TRUE <NA>         4    3
   #> + new[5, ]   10 10.1 FALSE    j        20    2
   #> - old[6, ]    6  6.1  TRUE <NA>        16    2
   #> + new[6, ]    6  6.1  TRUE <NA>        26    3
   #> - old[7, ]    7  7.1  TRUE    g        17    2
   #> + new[7, ]    7  7.1  TRUE    g        27    3
   #> - old[8, ]    8  8.1    NA    h        18    2
   #> + new[8, ]    8  8.1    NA    h        28    3
   #> - old[9, ]    9   NA    NA    i        19    2
   #> + new[9, ]    9   NA    NA    i        29    3
   #> - old[10, ]  10 10.1 FALSE    j        20    2
   #> + new[10, ]  10 10.1 FALSE    j        30    3
   #>   old[11, ]   6  6.1  TRUE <NA>        36    4
   #>   old[12, ]   7  7.1  TRUE    g        37    4
   #>   old[13, ]   8  8.1    NA    h        38    4
   #> 
   #> `old$int[1:8]`: 7 8 9 10  6 6 7 8
   #> `new$int[1:8]`: 6 7 8  9 10 6 7 8
   #> 
   #> `old$dbl[1:8]`: 7 8 NA 10  6 6 7 8
   #> `new$dbl[1:8]`: 6 7  8 NA 10 6 7 8
   #> 
   #> `old$lgl[1:8]`: TRUE <NA> <NA> FALSE TRUE  TRUE TRUE <NA>
   #> `new$lgl[1:8]`: TRUE TRUE <NA> <NA>  FALSE TRUE TRUE <NA>
   #> 
   #> `old$chr[1:8]`: "g" "h" "i" "j" NA  NA "g" "h"
   #> `new$chr[1:8]`: NA  "g" "h" "i" "j" NA "g" "h"
   #> 
   #> `old$row_order[1:13]`:  0  0  0  0  4 16 17 18 19 20 and 3 more...
   #> `new$row_order[1:13]`: 16 17 18 19 20 26 27 28 29 30           ...
   #> 
   #> `old$part[1:13]`: 3 3 3 3 3 2 2 2 2 2 and 3 more...
   #> `new$part[1:13]`: 2 2 2 2 2 3 3 3 3 3           ...
   ```
   
   <sup>Created on 2021-12-02 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup>
   
   </details>
   
   Third, `Query Stream is closed`:
   
   <details>
   
   ```
   Error: IOError: Query Stream is closed
   /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/c/bridge.cc:1759  StatusFromCError(stream_.get_next(&stream_, &c_array))
   /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.h:222  ReadNext(&batch)
   /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/util/iterator.h:428  it_.Next()
   /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/exec/exec_plan.cc:417  iterator_.Next()
   /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.cc:326  ReadNext(&batch)
   /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.cc:337  ReadAll(&batches)
   ```
   
   </details>
   
   Fourth, a segfault (this has only happened once).


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to