nealrichardson commented on a change in pull request #10624:
URL: https://github.com/apache/arrow/pull/10624#discussion_r669913235
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (start <= 0) {
+ start <- 1
+ }
+
+ if (stop < start) {
+ stop <- 0
Review comment:
Can you explain these? It's not obvious why this is correct, and the
base R code and docs don't discuss these cases.
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (start <= 0) {
+ start <- 1
+ }
+
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start - 1L, stop = stop)
+ )
+}
+
+nse_funcs$substring <- function(text, first, last = 1000000L) {
Review comment:
You could just implement this one by calling nse_funcs$substr. The
validation messages won't be exactly right because the argument names are
different, but per the docs this function is only kept for compatibility with
S, so I don't think it's a big deal.
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (start <= 0) {
+ start <- 1
+ }
+
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start - 1L, stop = stop)
+ )
+}
+
+nse_funcs$substring <- function(text, first, last = 1000000L) {
+ assert_that(
+ length(first) == 1,
+ msg = "`first` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(last) == 1,
+ msg = "`last` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (first <= 0) {
+ first <- 1
+ }
+
+ if (last < first) {
+ last <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ text,
+ options = list(start = first - 1L, stop = last)
+ )
+}
+
+nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(end) == 1,
+ msg = "`end` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (start == 0) start <- 1
+
+ if (end == -1) end <- .Machine$integer.max
+
+ if (end < start) end <- 0
+
+ if (start > 0) start <- start - 1L
Review comment:
Why does this version subtract 1 up here but the others don't? Why only
if start > 0? Is start <= 0 valid?
##########
File path: r/R/dplyr-functions.R
##########
@@ -391,7 +470,7 @@ nse_funcs$str_split <- function(string, pattern, n = Inf,
simplify = FALSE) {
string,
options = list(
pattern =
- opts$pattern,
+ opts$pattern,
Review comment:
I know this is just linting but IDK why opts$pattern is on its own line
instead of next to = above it.
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (start <= 0) {
+ start <- 1
+ }
+
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start - 1L, stop = stop)
Review comment:
Why do we only subtract 1 from start but not stop?
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (start <= 0) {
+ start <- 1
+ }
+
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start - 1L, stop = stop)
+ )
+}
+
+nse_funcs$substring <- function(text, first, last = 1000000L) {
+ assert_that(
+ length(first) == 1,
+ msg = "`first` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(last) == 1,
+ msg = "`last` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (first <= 0) {
+ first <- 1
+ }
+
+ if (last < first) {
+ last <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ text,
+ options = list(start = first - 1L, stop = last)
+ )
+}
+
+nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(end) == 1,
+ msg = "`end` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (start == 0) start <- 1
Review comment:
Please use braces even for short if statements
##########
File path: r/tests/testthat/test-dplyr-string-functions.R
##########
@@ -909,5 +896,206 @@ test_that("str_pad", {
collect(),
df
)
+})
+
+test_that("substr", {
+ df <- tibble(x = "Apache Arrow")
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 0, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, -1, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 6, 1)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, -1, -2)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 9, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 8, 12)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, -5, -1)) %>%
+ collect(),
+ df
+ )
+})
+
+test_that("substring", {
Review comment:
If you made substring just call substr then you could delete all but one
of these tests.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]