[GitHub] [arrow] jonkeane commented on a diff in pull request #12154: ARROW-14821: [R] Implement bindings for lubridate's floor_date, ceiling_date, and round_date

GitBox Tue, 28 Jun 2022 11:18:58 -0700


jonkeane commented on code in PR #12154:
URL: https://github.com/apache/arrow/pull/12154#discussion_r908796810



##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -46,6 +46,46 @@ test_df <- tibble::tibble(
   integer = 1:2
 )
 
+# a simplified test case using UTC timezone only
+test_df_v2 <- tibble::tibble(
+  datetime = c(as.POSIXct("2017-01-01 00:00:11.3456789", tz = "UTC"), NA) + 1,

Review Comment:
   I wonder if (1) we want to test non-UTC times too? Would that change 
anything? and (2) if we do use a timezone that's not UTC, we should also have 
times that would be on either side of midnight on different days _in_ UTC for 
cases where we rollup to days, we want to make sure that does the right thing.



##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -1965,3 +2005,385 @@ test_that("lubridate's fast_strptime", {
       collect()
   )
 })
+
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime, change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+    "Rounding with second > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+    "Rounding with minute > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("datetime round/floor/ceil to month/quarter/year", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "month"),
+        out_2 = round_date(datetime, "quarter"),
+        out_3 = round_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = floor_date(datetime, "month"),
+        out_2 = floor_date(datetime, "quarter"),
+        out_3 = floor_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "month", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "quarter", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "year", change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+})
+
+
+check_boundary_with_unit <- function(unit, ...) {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        cob_null = ceiling_date(datetime, unit, change_on_boundary = NULL),
+        cob_true = ceiling_date(datetime, unit, change_on_boundary = TRUE),
+        cob_false = ceiling_date(datetime, unit, change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    boundary_times,
+    ...
+  )
+}
+
+test_that("ceiling_time works with change_on_boundary: unit = day", {
+  check_boundary_with_unit("day")
+})
+test_that("ceiling_time works with change_on_boundary: unit = hour", {
+  check_boundary_with_unit("hour")
+})
+test_that("ceiling_time works with change_on_boundary: unit = minute", {
+  check_boundary_with_unit("minute", tolerance = .001) # floating point issue?
+})
+test_that("ceiling_time works with change_on_boundary: unit = second", {
+  check_boundary_with_unit("second")
+})
+test_that("ceiling_time works with change_on_boundary: unit = millisecond", {
+  check_boundary_with_unit(".001 second")
+})
+
+
+# NOTE: until 16142 is resolved, this test has to be written to avoid the
+# 32-bit temporal array misinterpreted as 64-bit bug. The easiest solution
+# is to never use an arrow array of length greater than 1:
+# https://issues.apache.org/jira/browse/ARROW-16142

Review Comment:
   Hmm is this in reference to the test just below it?



##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -686,4 +689,61 @@ register_bindings_datetime_parsers <- function() {
 
     build_expr("assume_timezone", coalesce_output, options = list(timezone = 
tz))
   })
+
+}
+
+
+register_bindings_datetime_rounding <- function() {
+
+  register_binding("round_date", function(x, unit = "second",
+                                          week_start = 
getOption("lubridate.week.start", 7)) {
+    opts <- parse_period_unit(unit)
+
+    if (opts$unit == 7L) {
+      return(shift_temporal_to_week("round_temporal", x, week_start, options = 
opts))
+    }

Review Comment:
   Would it be possible to add a comment about which unit is `7L` here so it's 
easier to follow which one gets this special treatment?



##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -1965,3 +2005,385 @@ test_that("lubridate's fast_strptime", {
       collect()
   )
 })
+
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime, change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+    "Rounding with second > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+    "Rounding with minute > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("datetime round/floor/ceil to month/quarter/year", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "month"),
+        out_2 = round_date(datetime, "quarter"),
+        out_3 = round_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = floor_date(datetime, "month"),
+        out_2 = floor_date(datetime, "quarter"),
+        out_3 = floor_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "month", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "quarter", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "year", change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+})
+
+
+check_boundary_with_unit <- function(unit, ...) {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        cob_null = ceiling_date(datetime, unit, change_on_boundary = NULL),
+        cob_true = ceiling_date(datetime, unit, change_on_boundary = TRUE),
+        cob_false = ceiling_date(datetime, unit, change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    boundary_times,
+    ...
+  )
+}

Review Comment:
   This test helper is fantastic, it might be worth adding a small comment 
about what it does (it's pretty obvious, but I've thought that before and 
future-me has proven myself wrong!)



##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -686,4 +689,61 @@ register_bindings_datetime_parsers <- function() {
 
     build_expr("assume_timezone", coalesce_output, options = list(timezone = 
tz))
   })
+
+}
+
+
+register_bindings_datetime_rounding <- function() {
+
+  register_binding("round_date", function(x, unit = "second",
+                                          week_start = 
getOption("lubridate.week.start", 7)) {
+    opts <- parse_period_unit(unit)
+
+    if (opts$unit == 7L) {
+      return(shift_temporal_to_week("round_temporal", x, week_start, options = 
opts))
+    }
+
+    Expression$create("round_temporal", x, options = opts)
+  })
+
+  register_binding("floor_date", function(x, unit = "second",
+                                          week_start = 
getOption("lubridate.week.start", 7)) {
+
+    opts <- parse_period_unit(unit)
+
+    if (opts$unit == 7L) {
+      return(shift_temporal_to_week("floor_temporal", x, week_start, options = 
opts))
+    }
+
+    return(Expression$create("floor_temporal", x, options = opts))
+  })
+
+  register_binding("ceiling_date", function(x, unit = "second",
+                                            change_on_boundary = NULL,
+                                            week_start = 
getOption("lubridate.week.start", 7)) {
+
+    opts <- parse_period_unit(unit)
+
+    if (is.null(change_on_boundary)) {
+      if (call_binding("is.Date", x)) {
+        change_on_boundary <- TRUE
+      } else {
+        change_on_boundary <- FALSE
+      }
+    }
+
+    if (change_on_boundary == FALSE) {
+      opts$ceil_is_strictly_greater <- FALSE
+    }
+    if (change_on_boundary == TRUE) {
+      opts$ceil_is_strictly_greater <- TRUE
+    }

Review Comment:
   An if/else might make this look slight more unified and that it is an either 
or option (though it's pretty obvious from the code that you're overwritting 
this variable)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] jonkeane commented on a diff in pull request #12154: ARROW-14821: [R] Implement bindings for lubridate's floor_date, ceiling_date, and round_date

Reply via email to