nealrichardson commented on a change in pull request #10624:
URL: https://github.com/apache/arrow/pull/10624#discussion_r669913235



##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both", 
"left", "right")) {
   Expression$create(trim_fun, string)
 }
 
+nse_funcs$substr <- function(string, start, stop) {
+  assert_that(
+    length(start) == 1,
+    msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(stop) == 1,
+    msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (start <= 0) {
+    start <- 1
+  }
+
+  if (stop < start) {
+    stop <- 0

Review comment:
       Can you explain these? It's not obvious why this is correct, and the 
base R code and docs don't discuss these cases.

##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both", 
"left", "right")) {
   Expression$create(trim_fun, string)
 }
 
+nse_funcs$substr <- function(string, start, stop) {
+  assert_that(
+    length(start) == 1,
+    msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(stop) == 1,
+    msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (start <= 0) {
+    start <- 1
+  }
+
+  if (stop < start) {
+    stop <- 0
+  }
+
+  Expression$create(
+    "utf8_slice_codeunits",
+    string,
+    options = list(start = start - 1L, stop = stop)
+  )
+}
+
+nse_funcs$substring <- function(text, first, last = 1000000L) {

Review comment:
       You could just implement this one by calling nse_funcs$substr. The 
validation messages won't be exactly right because the argument names are 
different, but per the docs this function is only kept for compatibility with 
S, so I don't think it's a big deal.

##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both", 
"left", "right")) {
   Expression$create(trim_fun, string)
 }
 
+nse_funcs$substr <- function(string, start, stop) {
+  assert_that(
+    length(start) == 1,
+    msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(stop) == 1,
+    msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (start <= 0) {
+    start <- 1
+  }
+
+  if (stop < start) {
+    stop <- 0
+  }
+
+  Expression$create(
+    "utf8_slice_codeunits",
+    string,
+    options = list(start = start - 1L, stop = stop)
+  )
+}
+
+nse_funcs$substring <- function(text, first, last = 1000000L) {
+  assert_that(
+    length(first) == 1,
+    msg = "`first` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(last) == 1,
+    msg = "`last` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (first <= 0) {
+    first <- 1
+  }
+
+  if (last < first) {
+    last <- 0
+  }
+
+  Expression$create(
+    "utf8_slice_codeunits",
+    text,
+    options = list(start = first - 1L, stop = last)
+  )
+}
+
+nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
+  assert_that(
+    length(start) == 1,
+    msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(end) == 1,
+    msg = "`end` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (start == 0) start <- 1
+
+  if (end == -1) end <- .Machine$integer.max
+
+  if (end < start) end <- 0
+
+  if (start > 0) start <- start - 1L

Review comment:
       Why does this version subtract 1 up here but the others don't? Why only 
if start > 0? Is start <= 0 valid?

##########
File path: r/R/dplyr-functions.R
##########
@@ -391,7 +470,7 @@ nse_funcs$str_split <- function(string, pattern, n = Inf, 
simplify = FALSE) {
     string,
     options = list(
       pattern =
-      opts$pattern,
+        opts$pattern,

Review comment:
       I know this is just linting but IDK why opts$pattern is on its own line 
instead of next to = above it.

##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both", 
"left", "right")) {
   Expression$create(trim_fun, string)
 }
 
+nse_funcs$substr <- function(string, start, stop) {
+  assert_that(
+    length(start) == 1,
+    msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(stop) == 1,
+    msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (start <= 0) {
+    start <- 1
+  }
+
+  if (stop < start) {
+    stop <- 0
+  }
+
+  Expression$create(
+    "utf8_slice_codeunits",
+    string,
+    options = list(start = start - 1L, stop = stop)

Review comment:
       Why do we only subtract 1 from start but not stop?

##########
File path: r/R/dplyr-functions.R
##########
@@ -280,6 +284,81 @@ nse_funcs$str_trim <- function(string, side = c("both", 
"left", "right")) {
   Expression$create(trim_fun, string)
 }
 
+nse_funcs$substr <- function(string, start, stop) {
+  assert_that(
+    length(start) == 1,
+    msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(stop) == 1,
+    msg = "`stop` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (start <= 0) {
+    start <- 1
+  }
+
+  if (stop < start) {
+    stop <- 0
+  }
+
+  Expression$create(
+    "utf8_slice_codeunits",
+    string,
+    options = list(start = start - 1L, stop = stop)
+  )
+}
+
+nse_funcs$substring <- function(text, first, last = 1000000L) {
+  assert_that(
+    length(first) == 1,
+    msg = "`first` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(last) == 1,
+    msg = "`last` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (first <= 0) {
+    first <- 1
+  }
+
+  if (last < first) {
+    last <- 0
+  }
+
+  Expression$create(
+    "utf8_slice_codeunits",
+    text,
+    options = list(start = first - 1L, stop = last)
+  )
+}
+
+nse_funcs$str_sub <- function(string, start = 1L, end = -1L) {
+  assert_that(
+    length(start) == 1,
+    msg = "`start` must be length 1 - other lengths are not supported in Arrow"
+  )
+  assert_that(
+    length(end) == 1,
+    msg = "`end` must be length 1 - other lengths are not supported in Arrow"
+  )
+
+  if (start == 0) start <- 1

Review comment:
       Please use braces even for short if statements

##########
File path: r/tests/testthat/test-dplyr-string-functions.R
##########
@@ -909,5 +896,206 @@ test_that("str_pad", {
       collect(),
     df
   )
+})
+
+test_that("substr", {
+  df <- tibble(x = "Apache Arrow")
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, 1, 6)) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, 0, 6)) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, -1, 6)) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, 6, 1)) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, -1, -2)) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, 9, 6)) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, 1, 6)) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, 8, 12)) %>%
+      collect(),
+    df
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      mutate(y = substr(x, -5, -1)) %>%
+      collect(),
+    df
+  )
+})
+
+test_that("substring", {

Review comment:
       If you made substring just call substr then you could delete all but one 
of these tests.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to