wjones127 commented on PR #15131:
URL: https://github.com/apache/arrow/pull/15131#issuecomment-1368064616
Double checked, and @thisisnic's repros are all fixed by this:
``` r
# Reprex: case_when() evaluated on Arrow tables vs. plain tibbles.
# The `#>` lines are the output captured after the fix was applied.
library(dplyr, warn.conflicts = FALSE)
library(arrow, warn.conflicts = FALSE)
# Specific conditions where this happens: a table with one NA and 64 or more
# non-NA values
test_df <- tibble::tibble(x = c(NA, rep("foo", 64)))
test_arrow <- arrow_table(test_df)
# the non-arrow version; all the final values are 1
test_df %>%
  mutate(y = case_when(x == "foo" ~ 1, is.na(x) ~ NA_real_)) %>%
  tail()
#> # A tibble: 6 × 2
#> x y
#> <chr> <dbl>
#> 1 foo 1
#> 2 foo 1
#> 3 foo 1
#> 4 foo 1
#> 5 foo 1
#> 6 foo 1
# the arrow version; before the fix the final value was NA, now it is 1 as in
# the non-arrow version
test_arrow %>%
  mutate(y = case_when(x == "foo" ~ 1, is.na(x) ~ NA_real_)) %>%
  collect() %>%
  tail()
#> # A tibble: 6 × 2
#> x y
#> <chr> <dbl>
#> 1 foo 1
#> 2 foo 1
#> 3 foo 1
#> 4 foo 1
#> 5 foo 1
#> 6 foo 1
# it's fine if there are fewer than 65 values in the table (i.e. it still
# contains an NA)
test_arrow[1:64, ] %>%
  mutate(y = case_when(x == "foo" ~ 1, is.na(x) ~ NA_real_)) %>%
  collect() %>%
  tail()
#> # A tibble: 6 × 2
#> x y
#> <chr> <dbl>
#> 1 foo 1
#> 2 foo 1
#> 3 foo 1
#> 4 foo 1
#> 5 foo 1
#> 6 foo 1
# everything is fine when the comparison is done on doubles and the return
# value is character
test_df2 <- tibble::tibble(x = c(NA, rep(1, 64)))
test_arrow2 <- arrow_table(test_df2)
test_arrow2 %>%
  mutate(y = case_when(x == 1 ~ "winning", is.na(x) ~ NA_character_)) %>%
  collect() %>%
  tail()
#> # A tibble: 6 × 2
#> x y
#> <dbl> <chr>
#> 1 1 winning
#> 2 1 winning
#> 3 1 winning
#> 4 1 winning
#> 5 1 winning
#> 6 1 winning
# was also broken before the fix when the source value is logical and the
# target value is double
test_df3 <- tibble::tibble(x = c(NA, rep(TRUE, 64)))
test_arrow3 <- arrow_table(test_df3)
test_arrow3 %>%
  mutate(y = case_when(x == TRUE ~ 1, is.na(x) ~ NA_real_)) %>%
  collect() %>%
  tail()
#> # A tibble: 6 × 2
#> x y
#> <lgl> <dbl>
#> 1 TRUE 1
#> 2 TRUE 1
#> 3 TRUE 1
#> 4 TRUE 1
#> 5 TRUE 1
#> 6 TRUE 1
# was also broken before the fix when the target is integer
test_df4 <- tibble::tibble(x = c(NA, rep(TRUE, 64)))
test_arrow4 <- arrow_table(test_df4)
test_arrow4 %>%
  mutate(y = case_when(x == TRUE ~ 1L, is.na(x) ~ 2L)) %>%
  collect() %>%
  tail()
#> # A tibble: 6 × 2
#> x y
#> <lgl> <int>
#> 1 TRUE 1
#> 2 TRUE 1
#> 3 TRUE 1
#> 4 TRUE 1
#> 5 TRUE 1
#> 6 TRUE 1
# was also broken before the fix for logical to logical
test_df5 <- tibble::tibble(x = c(NA, rep(TRUE, 64)))
test_arrow5 <- arrow_table(test_df5)
test_arrow5 %>%
  mutate(y = case_when(x == TRUE ~ TRUE, is.na(x) ~ FALSE)) %>%
  collect() %>%
  tail()
#> # A tibble: 6 × 2
#> x y
#> <lgl> <lgl>
#> 1 TRUE TRUE
#> 2 TRUE TRUE
#> 3 TRUE TRUE
#> 4 TRUE TRUE
#> 5 TRUE TRUE
#> 6 TRUE TRUE
```
<sup>Created on 2022-12-30 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]