This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch maint-6.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 03197788f8d47dcff577761a574c2ab2d48f4a24
Author: Dragos Moldovan-Grünfeld <[email protected]>
AuthorDate: Fri Oct 22 15:47:33 2021 +0100

    ARROW-13156 [R] bindings for str_count

    Closes #11473 from dragosmg/ARROW-13156_str_count_bindings

    Lead-authored-by: Dragos Moldovan-Grünfeld <[email protected]>
    Co-authored-by: Dragoș Moldovan-Grünfeld <[email protected]>
    Signed-off-by: Nic Crane <[email protected]>
---
 r/R/dplyr-functions.R                      | 13 +++++++
 r/tests/testthat/test-dplyr-funcs-string.R | 60 ++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index dbb9d5f..717cdae 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -645,6 +645,19 @@ nse_funcs$str_ends <- function(string, pattern, negate = FALSE) {
   out
 }
 
+nse_funcs$str_count <- function(string, pattern) {
+  opts <- get_stringr_pattern_options(enexpr(pattern))
+  if (!is.string(pattern)) {
+    arrow_not_supported("`pattern` must be a length 1 character vector; other values")
+  }
+  arrow_fun <- ifelse(opts$fixed, "count_substring", "count_substring_regex")
+  Expression$create(
+    arrow_fun,
+    string,
+    options = list(pattern = opts$pattern, ignore_case = opts$ignore_case)
+  )
+}
+
 # String function helpers
 
 # format `pattern` as needed for case insensitivity and literal matching by RE2
diff --git a/r/tests/testthat/test-dplyr-funcs-string.R b/r/tests/testthat/test-dplyr-funcs-string.R
index dd59b5a..333735b 100644
--- a/r/tests/testthat/test-dplyr-funcs-string.R
+++ b/r/tests/testthat/test-dplyr-funcs-string.R
@@ -1336,3 +1336,63 @@ test_that("str_starts, str_ends, startsWith, endsWith", {
     df
   )
 })
+
+test_that("str_count", {
+  df <- tibble(
+    cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"),
+    dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(a_count = str_count(cities, pattern = "a")) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(p_count = str_count(cities, pattern = "d")) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(p_count = str_count(cities,
+        pattern = regex("d", ignore_case = TRUE)
+      )) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(e_count = str_count(cities, pattern = "u")) %>%
+      collect(),
+    df
+  )
+
+  # nse_funcs$str_count() is not vectorised over pattern
+  expect_dplyr_equal(
+    input %>%
+      mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>%
+      collect(),
+    df,
+    warning = TRUE
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(dots_count = str_count(dots, ".")) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(dots_count = str_count(dots, fixed("."))) %>%
+      collect(),
+    df
+  )
+})
