djnavarro commented on code in PR #12154:
URL: https://github.com/apache/arrow/pull/12154#discussion_r843493108


##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -650,6 +658,345 @@ test_that("extract yday from date", {
   )
 })
 
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime, change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+    "Rounding with second > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+    "Rounding with minute > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("datetime round/floor/ceil to month/quarter/year", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "month"),
+        out_2 = round_date(datetime, "quarter"),
+        out_3 = round_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = floor_date(datetime, "month"),
+        out_2 = floor_date(datetime, "quarter"),
+        out_3 = floor_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "month", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "quarter", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "year", change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+})
+
+
+test_that("change_on_boundary is respected in ceiling_time", {
+
+  boundary_times <- tibble::tibble(
+    datetime = as.POSIXct(strptime(c(
+      "2022-05-10 00:00:00", # boundary for week (Sunday / week_start = 7)
+      "2022-03-10 00:00:00", # boundary for: day, hour, minute, second, 
millisecond
+      "2022-03-10 00:00:01", # boundary for: second, millisecond
+      "2022-03-10 00:01:00", # boundary for: second, millisecond, minute
+      "2022-03-10 01:00:00"  # boundary for: second, millisecond, minute, hour
+    ), tz="UTC", format = "%F %T"))
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day"),
+        out_2 = ceiling_date(datetime, "hour"),
+        out_3 = ceiling_date(datetime, "minute"),
+        out_4 = ceiling_date(datetime, "second"),
+        out_5 = ceiling_date(datetime, ".001 second")
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "hour", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "minute", change_on_boundary = FALSE),
+        out_4 = ceiling_date(datetime, "second", change_on_boundary = FALSE),
+        out_5 = ceiling_date(datetime, ".001 second", change_on_boundary = 
FALSE)
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day", change_on_boundary = TRUE),
+        out_2 = ceiling_date(datetime, "hour", change_on_boundary = TRUE),
+        out_3 = ceiling_date(datetime, "minute", change_on_boundary = TRUE),
+        out_4 = ceiling_date(datetime, "second", change_on_boundary = TRUE),
+        out_5 = ceiling_date(datetime, ".001 second", change_on_boundary = 
TRUE)
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  boundary_dates <- tibble::tibble(
+    date = as.Date(c(
+      "2001-05-10", # regular day
+      "2002-05-10", # regular day
+      "2003-05-10", # regular day
+      "2004-05-10", # regular day
+      "2005-05-10", # regular day
+      "2006-05-10", # regular day
+      "2007-05-10"  # regular day
+    ))
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(date, "day", change_on_boundary = FALSE),
+        out_2 = ceiling_date(date, "day", change_on_boundary = TRUE),
+        out_3 = ceiling_date(date, "day")
+      ) %>%
+      collect(),
+    boundary_dates
+  )

Review Comment:
   Hi @rok. Making progress with this, but pausing to ping you because this 
test currently throws a very strange error. The input here is a vector of dates 
with years 2001, 2002, 2003, etc. The output should contain the same vectors, 
because the rounding unit here is just to the nearest day, i.e.,
   
   ```
   2001-05-10,
   2002-05-10,
   2003-05-10,
   2004-05-10,
   etc
   ```
   
   what we actually get is the correct answer but with the unix epoch 
interleaved, i.e.,
   
   ```
   2001-05-10,
   1970-01-01,
   2002-05-10,
   1970-01-01,
   2003-05-10,
   etc
   ```
   
   My guess is that this must be something to do with how the data buffer is 
being parsed? I'm guessing a single 64-bit date in the C++ library has been 
treated as if it is two 32-bit dates when it's being pulled back into R. Any 
thoughts on what might be going on here?  
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to