nealrichardson commented on a change in pull request #10624:
URL: https://github.com/apache/arrow/pull/10624#discussion_r670405245
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,68 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ # substr treats values as if they're on a continous number line, so values
+ # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
+ # this behavior
+ if (start <= 0) {
+ start <- 1
+ }
+
+ # if `stop` is lower than `start`, this is invalid, so set `stop` to
+ # 0 so that an empty string will be returned
Review comment:
```suggestion
# 0 so that an empty string will be returned (consistent with
base::substr())
```
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,68 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ # substr treats values as if they're on a continous number line, so values
+ # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
+ # this behavior
+ if (start <= 0) {
+ start <- 1
+ }
+
+ # if `stop` is lower than `start`, this is invalid, so set `stop` to
+ # 0 so that an empty string will be returned
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start - 1L, stop = stop)
+ )
+}
+
+nse_funcs$substring <- nse_funcs$substr
+
+nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(end) == 1,
+ msg = "`end` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (end == -1) {
+ end <- .Machine$integer.max
+ }
+
+ if (end < start) {
+ end <- 0
+ }
+
+
Review comment:
```suggestion
```
##########
File path: r/tests/testthat/test-dplyr-string-functions.R
##########
@@ -909,5 +895,176 @@ test_that("str_pad", {
collect(),
df
)
+})
+
+test_that("substr", {
+ df <- tibble(x = "Apache Arrow")
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 0, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, -1, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 6, 1)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, -1, -2)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 9, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 1, 6)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, 8, 12)) %>%
+ collect(),
+ df
+ )
+
+ expect_dplyr_equal(
+ input %>%
+ mutate(y = substr(x, -5, -1)) %>%
+ collect(),
+ df
+ )
+
+ expect_error(
+ nse_funcs$substr("Apache Arrow", c(1, 2), 3),
+ "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ expect_error(
+ nse_funcs$substr("Apache Arrow", 1, c(2, 3)),
+ "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+})
+
+test_that("substring", {
+ df <- tibble(x = "Apache Arrow")
Review comment:
```suggestion
# nse_funcs$substring just calls nse_funcs$substr, tested extensively above
df <- tibble(x = "Apache Arrow")
```
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,68 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ # substr treats values as if they're on a continous number line, so values
+ # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
+ # this behavior
+ if (start <= 0) {
+ start <- 1
+ }
+
+ # if `stop` is lower than `start`, this is invalid, so set `stop` to
+ # 0 so that an empty string will be returned
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start - 1L, stop = stop)
+ )
+}
+
+nse_funcs$substring <- nse_funcs$substr
+
+nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(end) == 1,
+ msg = "`end` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (end == -1) {
+ end <- .Machine$integer.max
+ }
+
+ if (end < start) {
+ end <- 0
+ }
+
+
+ if (start > 0) {
+ start <- start - 1L
Review comment:
These deserve explanatory comments, particularly noting the differences
in behavior among base::substr, stringr::str_sub, and arrow's
utf8_slice_codeunits. Basically a concise version of what you answered me in
the PR review. (It's a good rule of thumb that if I ask you why something is a
certain way because it wasn't obvious to me, the answer should probably be a
code comment and not (just) a PR comment--if it's not obvious to the reader
now, it definitely won't be obvious to us in 6 months or whenever we have to
revise this code and wonder why things are munged a certain way.)
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,68 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ # substr treats values as if they're on a continous number line, so values
+ # 0 are effectively blank characters - set `start` to 1 here so Arrow mimics
+ # this behavior
+ if (start <= 0) {
+ start <- 1
+ }
+
+ # if `stop` is lower than `start`, this is invalid, so set `stop` to
+ # 0 so that an empty string will be returned
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start - 1L, stop = stop)
+ )
+}
+
+nse_funcs$substring <- nse_funcs$substr
Review comment:
This won't exactly work because the arguments are named differently
##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both",
"left", "right")) {
Expression$create(trim_fun, string)
}
+nse_funcs$substr <- function(string, start, stop) {
+ assert_that(
+ length(start) == 1,
+ msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+ )
+ assert_that(
+ length(stop) == 1,
+ msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+ )
+
+ if (start <= 0) {
+ start <- 1
+ }
+
+ if (stop < start) {
+ stop <- 0
+ }
+
+ Expression$create(
+ "utf8_slice_codeunits",
+ string,
+ options = list(start = start - 1L, stop = stop)
Review comment:
This deserves an explanatory comment
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]