paleolimbot commented on issue #33094:
URL: https://github.com/apache/arrow/issues/33094#issuecomment-1405237489
Ok, I can reproduce with only these tests running. I don't think they're all
needed, but I wanted to catalogue them since it took quite a while to get to
this point.
```r
# Shared setup for the tests below: load packages, set options, and build the
# `tbl` fixture that every `compare_dplyr_binding()` call receives.
library(arrow)
withr::local_options(list(
  arrow.summarise.sort = TRUE,
  rlib_warning_verbosity = "verbose",
  # This prevents the warning in `summarize()` about having grouped output
  # without also specifying what to do with `.groups`
  dplyr.summarise.inform = FALSE
))
library(dplyr, warn.conflicts = FALSE)
library(stringr)

# NOTE(review): `example_data` and `verses` are not defined in this snippet;
# presumably they come from the package's test helpers -- confirm before
# running this standalone.
tbl <- example_data
# Add some better string data
tbl$verses <- verses[[1]]
# c(" a ", "  b  ", "   c   ", ...) increasing padding
# nchar = 3 5 7 9 11 13 15 17 19 21
tbl$padded_strings <- stringr::str_pad(
  letters[1:10],
  width = 2 * (1:10) + 1,
  side = "both"
)
# Alternating 1/2 grouping column used by the grouped summarize tests.
tbl$some_grouping <- rep(c(1, 2), 5)
test_that("min() and max() on character strings", {
  # Ungrouped: smallest and largest string in `chr`, ignoring NAs.
  compare_dplyr_binding(
    .input %>%
      summarize(
        min_chr = min(chr, na.rm = TRUE),
        max_chr = max(chr, na.rm = TRUE)
      ) %>%
      collect(),
    tbl,
  )

  # Grouped by `fct`: same aggregates per group, with an explicit arrange()
  # on `min_chr` before collecting.
  compare_dplyr_binding(
    .input %>%
      group_by(fct) %>%
      summarize(
        min_chr = min(chr, na.rm = TRUE),
        max_chr = max(chr, na.rm = TRUE)
      ) %>%
      arrange(min_chr) %>%
      collect(),
    tbl,
  )
})
test_that("Do things after summarize", {
  # Reference value computed with plain dplyr: the `total` of the last group
  # after filtering and summarizing by `some_grouping`.
  totals_by_group <- tbl %>%
    group_by(some_grouping) %>%
    filter(int > 5) %>%
    summarize(total = sum(int, na.rm = TRUE))
  last_group_total <- tail(pull(totals_by_group), 1)

  # filter() and mutate() applied to the output of a grouped summarize().
  compare_dplyr_binding(
    .input %>%
      group_by(some_grouping) %>%
      filter(int > 5) %>%
      summarize(total = sum(int, na.rm = TRUE)) %>%
      filter(total == last_group_total) %>%
      mutate(extra = total * 5) %>%
      collect(),
    tbl
  )

  # A longer chain: filter/select/mutate before the aggregation, then a
  # mutate() that combines two of the aggregated columns.
  compare_dplyr_binding(
    .input %>%
      filter(dbl > 2) %>%
      select(chr, int, lgl) %>%
      mutate(twice = int * 2L) %>%
      group_by(lgl) %>%
      summarize(
        count = n(),
        total = sum(twice, na.rm = TRUE)
      ) %>%
      mutate(mean = total / count) %>%
      collect(),
    tbl
  )
})
test_that("Non-field variable references in aggregations", {
  int_table <- arrow_table(x = 1:5)
  scale_factor <- 10

  # Divisor supplied through a local R variable (not a column of the table).
  with_variable <- int_table %>%
    summarize(value = sum(x) / scale_factor) %>%
    collect()

  # Same aggregation with the divisor written as a literal.
  with_literal <- int_table %>%
    summarize(value = sum(x) / 10) %>%
    collect()

  expect_identical(with_variable, with_literal)
})
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]