thisisnic commented on code in PR #12154:
URL: https://github.com/apache/arrow/pull/12154#discussion_r913528817
##########
r/R/util.R:
##########
@@ -215,3 +215,138 @@ handle_csv_read_error <- function(e, schema, call) {
is_compressed <- function(compression) {
!identical(compression, "uncompressed")
}
+
+parse_period_unit <- function(x) {
Review Comment:
I know not all the utility functions here are documented, but as this one
does quite a lot of work, would you mind adding a brief comment (doesn't need
to be a proper roxygen header) about what this does, in terms of its inputs and
outputs please?
##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -2309,3 +2308,618 @@ test_that("build_formats() and
build_format_from_order()", {
"%y%b%d%H%M%S", "%Y%b%d%H%M%S")
)
})
+
+
+
+# tests for datetime rounding ---------------------------------------------
+
+# an easy date to avoid conflating tests of different things
+easy_date <- as.POSIXct("2022-10-11 12:00:00", tz = "UTC")
+easy_df <- tibble::tibble(datetime = easy_date)
+
+# dates near month boundaries over the course of 1 year
+month_boundaries <- c(
+ "2021-01-01 00:01:00", "2021-02-01 00:01:00", "2021-03-01 00:01:00",
+ "2021-04-01 00:01:00", "2021-05-01 00:01:00", "2021-06-01 00:01:00",
+ "2021-07-01 00:01:00", "2021-08-01 00:01:00", "2021-09-01 00:01:00",
+ "2021-10-01 00:01:00", "2021-11-01 00:01:00", "2021-12-01 00:01:00",
+ "2021-01-31 23:59:00", "2021-02-28 23:59:00", "2021-03-31 23:59:00",
+ "2021-04-30 23:59:00", "2021-05-31 23:59:00", "2021-06-30 23:59:00",
+ "2021-07-31 23:59:00", "2021-08-31 23:59:00", "2021-09-30 23:59:00",
+ "2021-10-31 23:59:00", "2021-11-30 23:59:00", "2021-12-31 23:59:00"
+)
+year_of_dates <- tibble::tibble(
+ datetime = as.POSIXct(month_boundaries, tz = "UTC"),
+ date = as.Date(datetime)
+)
+
+# test case used to check we catch week boundaries for all week_start values
+fortnight <- tibble::tibble(
+ date = as.Date(c(
+ "2022-04-04", # Monday
+ "2022-04-05", # Tuesday
+ "2022-04-06", # Wednesday
+ "2022-04-07", # Thursday
+ "2022-04-08", # Friday
+ "2022-04-09", # Saturday
+ "2022-04-10", # Sunday
+ "2022-04-11", # Monday
+ "2022-04-12", # Tuesday
+ "2022-04-13", # Wednesday
+ "2022-04-14", # Thursday
+ "2022-04-15", # Friday
+ "2022-04-16", # Saturday
+ "2022-04-17" # Sunday
+ )),
Review Comment:
We can probably shorten/simplify this with something like:
```
seq(from = as.Date("2022-04-04"), to = as.Date("2022-04-17"), by = "day")
```
##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -615,4 +618,44 @@ register_bindings_datetime_parsers <- function() {
build_expr("assume_timezone", coalesce_output, options = list(timezone =
tz))
})
+
+}
+
+
+register_bindings_datetime_rounding <- function() {
+
+ register_binding("round_date", function(x, unit = "second",
+ week_start =
getOption("lubridate.week.start", 7)) {
+ opts <- parse_period_unit(unit)
+ if (opts$unit == 7L) { # weeks (unit = 7L) are special
+ return(shift_temporal_to_week("round_temporal", x, week_start, options =
opts))
+ }
+ Expression$create("round_temporal", x, options = opts)
+ })
+
+ register_binding("floor_date", function(x, unit = "second",
Review Comment:
I don't think it needs changing as it doesn't actually make any *practical*
difference due to the later matching, but just mentioning here in case it ever
comes up: this doesn't 100% match the lubridate default value as for some
reason, the value is `second` in `round_date()` but then `seconds` for
`floor_date()` and `ceiling_date()`.
##########
r/R/util.R:
##########
@@ -215,3 +215,138 @@ handle_csv_read_error <- function(e, schema, call) {
is_compressed <- function(compression) {
!identical(compression, "uncompressed")
}
+
+parse_period_unit <- function(x) {
+
+ # the regexp matches against fractional units, but per lubridate
+ # supports integer multiples of a known unit only
+ match_info <- regexpr(
+ pattern = " *(?<multiple>[0-9.,]+)? *(?<unit>[^ \t\n]+)",
+ text = x[[1]],
+ perl = TRUE
+ )
+
+ capture_start <- attr(match_info, "capture.start")
+ capture_length <- attr(match_info, "capture.length")
+ capture_end <- capture_start + capture_length - 1L
+
+ str_unit <- substr(x, capture_start[[2]], capture_end[[2]])
+ str_multiple <- substr(x, capture_start[[1]], capture_end[[1]])
+
+ known_units <- c("nanosecond", "microsecond", "millisecond", "second",
+ "minute", "hour", "day", "week", "month", "quarter", "year")
+
+ # match the period unit
+ str_unit_start <- substr(str_unit, 1, 3)
+ unit <- as.integer(pmatch(str_unit_start, known_units)) - 1L
+
+ if (any(is.na(unit))) {
+ abort(sprintf("Unknown unit '%s'", str_unit))
+ }
+
+ # empty string in multiple interpreted as 1
+ if (capture_length[[1]] == 0) {
+ multiple <- 1L
+
+ } else {
+
+ # special cases: interpret fractions of 1 second as integer
+ # multiples of nanoseconds, microseconds, or milliseconds
+ # to mirror lubridate syntax
+ multiple <- as.numeric(str_multiple)
+
+ if (unit == 3L && multiple < 10^-6) {
+ unit <- 0L
+ multiple <- 10^9 * multiple
+ }
+ if (unit == 3L && multiple < 10^-3) {
+ unit <- 1L
+ multiple <- 10^6 * multiple
+ }
+ if (unit == 3L && multiple < 1) {
+ unit <- 2L
+ multiple <- 10^3 * multiple
+ }
+
+ multiple <- as.integer(multiple)
+ }
+
+
+ # more special cases: lubridate imposes sensible maximum
+ # values on the number of seconds, minutes and hours
+ if (unit == 3L && multiple > 60) {
+ abort("Rounding with second > 60 is not supported")
+ }
+ if (unit == 4L && multiple > 60) {
+ abort("Rounding with minute > 60 is not supported")
+ }
+ if (unit == 5L && multiple > 24) {
+ abort("Rounding with hour > 24 is not supported")
+ }
+
+ return(list(unit = unit, multiple = multiple))
+}
+
+
+# handles round/ceil/floor when unit is week and week_start is
+# a non-standard value (not Monday or Sunday)
+shift_temporal_to_week <- function(fn, x, week_start, options) {
Review Comment:
Please can you add in a brief usage example here just to make it more
skimmable?
##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -2309,3 +2308,618 @@ test_that("build_formats() and
build_format_from_order()", {
"%y%b%d%H%M%S", "%Y%b%d%H%M%S")
)
})
+
+
+
+# tests for datetime rounding ---------------------------------------------
+
+# an easy date to avoid conflating tests of different things
Review Comment:
Might be obvious, but what does "easy" mean in this context?
##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -2309,3 +2308,618 @@ test_that("build_formats() and
build_format_from_order()", {
"%y%b%d%H%M%S", "%Y%b%d%H%M%S")
)
})
+
+
+
+# tests for datetime rounding ---------------------------------------------
+
+# an easy date to avoid conflating tests of different things
+easy_date <- as.POSIXct("2022-10-11 12:00:00", tz = "UTC")
+easy_df <- tibble::tibble(datetime = easy_date)
+
+# dates near month boundaries over the course of 1 year
+month_boundaries <- c(
+ "2021-01-01 00:01:00", "2021-02-01 00:01:00", "2021-03-01 00:01:00",
+ "2021-04-01 00:01:00", "2021-05-01 00:01:00", "2021-06-01 00:01:00",
+ "2021-07-01 00:01:00", "2021-08-01 00:01:00", "2021-09-01 00:01:00",
+ "2021-10-01 00:01:00", "2021-11-01 00:01:00", "2021-12-01 00:01:00",
+ "2021-01-31 23:59:00", "2021-02-28 23:59:00", "2021-03-31 23:59:00",
+ "2021-04-30 23:59:00", "2021-05-31 23:59:00", "2021-06-30 23:59:00",
+ "2021-07-31 23:59:00", "2021-08-31 23:59:00", "2021-09-30 23:59:00",
+ "2021-10-31 23:59:00", "2021-11-30 23:59:00", "2021-12-31 23:59:00"
+)
Review Comment:
This is the first and last day of each month, 1 minute before/after
midnight, right? We probably don't need to test on all of these; perhaps just
grab a subset that tend to cause issues, like around new year's eve, and the
last day of Feb.
##########
r/R/util.R:
##########
@@ -215,3 +215,138 @@ handle_csv_read_error <- function(e, schema, call) {
is_compressed <- function(compression) {
!identical(compression, "uncompressed")
}
+
+parse_period_unit <- function(x) {
+
+ # the regexp matches against fractional units, but per lubridate
+ # supports integer multiples of a known unit only
+ match_info <- regexpr(
+ pattern = " *(?<multiple>[0-9.,]+)? *(?<unit>[^ \t\n]+)",
+ text = x[[1]],
+ perl = TRUE
+ )
+
+ capture_start <- attr(match_info, "capture.start")
+ capture_length <- attr(match_info, "capture.length")
+ capture_end <- capture_start + capture_length - 1L
+
+ str_unit <- substr(x, capture_start[[2]], capture_end[[2]])
+ str_multiple <- substr(x, capture_start[[1]], capture_end[[1]])
+
+ known_units <- c("nanosecond", "microsecond", "millisecond", "second",
+ "minute", "hour", "day", "week", "month", "quarter", "year")
+
+ # match the period unit
+ str_unit_start <- substr(str_unit, 1, 3)
+ unit <- as.integer(pmatch(str_unit_start, known_units)) - 1L
+
+ if (any(is.na(unit))) {
+ abort(sprintf("Unknown unit '%s'", str_unit))
+ }
Review Comment:
Given that we know roughly what the units are (well, the things that `unit`
should closely match anyway), perhaps we could add a bit more info to this
error message, kinda like the stuff suggested in the [tidyverse style guide
page on error messages](https://style.tidyverse.org/error-messages.html).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]