jonkeane commented on a change in pull request #12506:
URL: https://github.com/apache/arrow/pull/12506#discussion_r828003665



##########
File path: r/R/dplyr-funcs-datetime.R
##########
@@ -189,6 +189,65 @@ register_bindings_datetime <- function() {
   })
 }
 
+register_bindings_duration <- function() {
+  register_binding("difftime", function(time1,
+                                        time2,
+                                        tz,
+                                        units = c("auto", "secs", "mins",
+                                                  "hours", "days", "weeks")) {
+    units <- match.arg(units)
+    if (units != "secs") {
+      abort("`difftime()` with units other than seconds not supported in 
Arrow")
+    }
+
+    if (!missing(tz)) {
+      warn("`tz` is an optional argument to `difftime()` in R and will not be 
used in Arrow")
+    }

Review comment:
       I don't think we need to say anything here other than "The `tz` argument 
is not supported in Arrow, so it is being ignored". No need to add anything 
about if it is optional in base R 

##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -974,3 +974,181 @@ test_that("date() errors with unsupported inputs", {
     regexp = "Unsupported cast from double to date32 using function 
cast_date32"
   )
 })
+
+test_that("difftime works correctly", {
+  test_df <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31 0:0:0", "2021-10-30", "2021-01-31 0:0:0")
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36")
+      ),
+    secs = c(121L, 234L, 345L, 456L)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(time1, time2, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(as.POSIXct("2022-03-07"), time1, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  # units other than "secs" not supported in arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        mins = difftime(time1, time2, units = "mins")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        hours = difftime(time1, time2, units = "hours")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        days = difftime(time1, time2, units = "days")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        weeks = difftime(time1, time2, units = "weeks")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  skip_on_os("windows")
+  test_df_with_tz <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"),
+      tz = "Europe/London"
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36"),
+      tz = "America/Chicago"
+    ),
+    secs = c(121L, 234L, 345L, 456L)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(secs2 = difftime(time2, time1, units = "secs")) %>%
+      collect(),
+    test_df_with_tz
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(
+          as.POSIXct("2022-03-07", tz = "Europe/Bucharest"),
+          time1,
+          units = "secs"
+        )
+      ) %>%
+      collect(),
+    test_df_with_tz
+  )
+
+  # `tz` is effectively ignored both in R (used only if inputs are POSIXlt) 
and Arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(secs2 = difftime(time2, time1, units = "secs", tz = 
"Pacific/Marquesas")) %>%
+      collect(),
+    test_df_with_tz,
+    warning = "`tz` is an optional argument"
+  )
+})
+
+test_that("as.difftime() works properly", {
+  test_df <- tibble(
+    hms_string = c("0:7:45", "12:34:56"),
+    hm_string = c("7:45", "12:34"),
+    int = c(30L, 75L),
+    integerish_dbl = c(31, 76),
+    dbl = c(31.2, 76.4)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(hms_difftime = as.difftime(hms_string, units = "secs")) %>%
+      collect(),
+    test_df
+  )
+
+  # TODO add test with `format` mismatch returning NA once
+  # https://issues.apache.org/jira/browse/ARROW-15659 is solved
+  # for example: as.difftime("07:", format = "%H:%M") should return NA
+  compare_dplyr_binding(
+    .input %>%
+      mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = 
"%H:%M")) %>%
+      collect(),
+    test_df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(int_difftime = as.difftime(int, units = "secs")) %>%
+      collect(),
+    test_df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(integerish_dbl_difftime = as.difftime(integerish_dbl, units = 
"secs")) %>%
+      collect(),
+    test_df
+  )
+
+  # "mins" or other values for units cannot be handled in Arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(int_difftime = as.difftime(int, units = "mins")) %>%
+      collect(),
+    test_df,
+    warning = TRUE
+  )
+
+  # only integer (or integer-like) -> duration supported in Arrow.
+  # double -> duration not supported
+  expect_error(
+    test_df %>%
+      arrow_table() %>%
+      mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>%
+      collect()
+  )
+})

Review comment:
       I see the convo with Nic about this — I'm on the fence about testing the 
C++ error here at all, but I'm fine leaving it in. Though I do think the 
comment should explicitly explain _why_ we're not testing the contents of the 
error message (since that is unusual + we should almost always avoid 
`expect_error()` without any kind of assertion at all, so if we're not doing 
that in this case, then we should be loud + clear why we've made that choice)

##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -974,3 +974,181 @@ test_that("date() errors with unsupported inputs", {
     regexp = "Unsupported cast from double to date32 using function 
cast_date32"
   )
 })
+
+test_that("difftime works correctly", {
+  test_df <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31 0:0:0", "2021-10-30", "2021-01-31 0:0:0")
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36")
+      ),
+    secs = c(121L, 234L, 345L, 456L)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(time1, time2, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(as.POSIXct("2022-03-07"), time1, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  # units other than "secs" not supported in arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        mins = difftime(time1, time2, units = "mins")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        hours = difftime(time1, time2, units = "hours")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        days = difftime(time1, time2, units = "days")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        weeks = difftime(time1, time2, units = "weeks")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  skip_on_os("windows")
+  test_df_with_tz <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"),
+      tz = "Europe/London"
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36"),
+      tz = "America/Chicago"
+    ),

Review comment:
       Could we use some more improbable timezones here? We try and use as 
unusual timezones as possible so that we avoid running into local timezones 
that just so happen to work locally and then get up to CI where many things are 
UTC and fail (or on a user's computer in some TZ and things fail)

##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -974,3 +974,181 @@ test_that("date() errors with unsupported inputs", {
     regexp = "Unsupported cast from double to date32 using function 
cast_date32"
   )
 })
+
+test_that("difftime works correctly", {
+  test_df <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31 0:0:0", "2021-10-30", "2021-01-31 0:0:0")
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36")
+      ),
+    secs = c(121L, 234L, 345L, 456L)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(time1, time2, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(as.POSIXct("2022-03-07"), time1, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  # units other than "secs" not supported in arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        mins = difftime(time1, time2, units = "mins")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        hours = difftime(time1, time2, units = "hours")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        days = difftime(time1, time2, units = "days")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        weeks = difftime(time1, time2, units = "weeks")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  skip_on_os("windows")
+  test_df_with_tz <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"),
+      tz = "Europe/London"
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36"),
+      tz = "America/Chicago"
+    ),
+    secs = c(121L, 234L, 345L, 456L)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(secs2 = difftime(time2, time1, units = "secs")) %>%
+      collect(),
+    test_df_with_tz
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(
+          as.POSIXct("2022-03-07", tz = "Europe/Bucharest"),
+          time1,
+          units = "secs"
+        )
+      ) %>%
+      collect(),
+    test_df_with_tz
+  )
+
+  # `tz` is effectively ignored both in R (used only if inputs are POSIXlt) 
and Arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(secs2 = difftime(time2, time1, units = "secs", tz = 
"Pacific/Marquesas")) %>%
+      collect(),
+    test_df_with_tz,
+    warning = "`tz` is an optional argument"
+  )
+})
+
+test_that("as.difftime() works properly", {

Review comment:
       ```suggestion
   test_that("as.difftime()", {
   ```
   
   I think we can be a little bit more terse here (and it means we don't have 
to remember which phrasing of "works properly" "works correctly" we use

##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -974,3 +974,181 @@ test_that("date() errors with unsupported inputs", {
     regexp = "Unsupported cast from double to date32 using function 
cast_date32"
   )
 })
+
+test_that("difftime works correctly", {
+  test_df <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31 0:0:0", "2021-10-30", "2021-01-31 0:0:0")
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36")
+      ),
+    secs = c(121L, 234L, 345L, 456L)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(time1, time2, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(as.POSIXct("2022-03-07"), time1, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  # units other than "secs" not supported in arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        mins = difftime(time1, time2, units = "mins")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        hours = difftime(time1, time2, units = "hours")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        days = difftime(time1, time2, units = "days")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        weeks = difftime(time1, time2, units = "weeks")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )

Review comment:
       We probably only need one of these other types of units, especially 
since we are basically testing base R at that point.

##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -974,3 +974,181 @@ test_that("date() errors with unsupported inputs", {
     regexp = "Unsupported cast from double to date32 using function 
cast_date32"
   )
 })
+
+test_that("difftime works correctly", {
+  test_df <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31 0:0:0", "2021-10-30", "2021-01-31 0:0:0")
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36")
+      ),
+    secs = c(121L, 234L, 345L, 456L)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(time1, time2, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(as.POSIXct("2022-03-07"), time1, units = "secs")
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = TRUE
+  )
+
+  # units other than "secs" not supported in arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        mins = difftime(time1, time2, units = "mins")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        hours = difftime(time1, time2, units = "hours")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        days = difftime(time1, time2, units = "days")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        weeks = difftime(time1, time2, units = "weeks")
+      ) %>%
+      collect(),
+    test_df,
+    warning = TRUE,
+    ignore_attr = TRUE
+  )
+
+  skip_on_os("windows")
+  test_df_with_tz <- tibble(
+    time1 = as.POSIXct(
+      c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"),
+      tz = "Europe/London"
+    ),
+    time2 = as.POSIXct(
+      c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", 
"2021-01-31 00:07:36"),
+      tz = "America/Chicago"
+    ),
+    secs = c(121L, 234L, 345L, 456L)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(secs2 = difftime(time2, time1, units = "secs")) %>%
+      collect(),
+    test_df_with_tz
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        secs2 = difftime(
+          as.POSIXct("2022-03-07", tz = "Europe/Bucharest"),
+          time1,
+          units = "secs"
+        )
+      ) %>%
+      collect(),
+    test_df_with_tz
+  )

Review comment:
       I appreciate the thoroughness of this, but I think we're covered with a 
test like this on 999-1007. I would say pick one or the other to keep and drop 
the other




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to