rok commented on a change in pull request #12154: URL: https://github.com/apache/arrow/pull/12154#discussion_r785142400
########## File path: r/R/util.R ########## @@ -209,3 +209,74 @@ handle_csv_read_error <- function(e, schema) { abort(e) } + + +parse_period_unit <- function(x) { + + # the regexp matches against fractional units, but per lubridate + # supports integer multiples of a known unit only + match_info <- regexpr( + pattern = " *(?<multiple>[0-9.,]+)? *(?<unit>[^ \t\n]+)", + text = x[[1]], + perl = TRUE + ) + + capture_start <- attr(match_info, "capture.start") + capture_length <- attr(match_info, "capture.length") + capture_end <- capture_start + capture_length - 1L + + str_unit <- substr(x, capture_start[[2]], capture_end[[2]]) + str_multiple <- substr(x, capture_start[[1]], capture_end[[1]]) + + known_units <- c("nanosecond", "microsecond", "millisecond", "second", + "minute", "hour", "day", "week", "month", "quarter", "year") + + # match the period unit + str_unit_start <- substr(str_unit, 1, 3) + unit <- as.integer(pmatch(str_unit_start, known_units)) - 1L + + if(any(is.na(unit))) { + abort(sprintf("Unknown unit '%s'", str_unit)) + } + + # empty string in multiple interpreted as 1 + if(capture_length[[1]] == 0) { + multiple <- 1L + + } else { + + # special cases: interpret fractions of 1 second as integer + # multiples of nanoseconds, microseconds, or milliseconds + # to mirror lubridate syntax + multiple <- as.numeric(str_multiple) + + if(unit == 3L & multiple < 10^-6) { + unit <- 0L + multiple <- 10^9 * multiple + } + if(unit == 3L & multiple < 10^-3) { + unit <- 1L + multiple <- 10^6 * multiple + } + if(unit == 3L & multiple < 1) { + unit <- 2L + multiple <- 10^3 * multiple + } + + multiple <- as.integer(multiple) + } + + # more special cases: lubridate imposes sensible maximum + # values on the number of seconds, minutes and hours + if(unit == 3L & multiple > 60) { + abort("Rounding with second > 60 is not supported") + } + if(unit == 4L & multiple > 60) { + abort("Rounding with minute > 60 is not supported") Review comment: How about we allow such cases (minute 
> 60)? The C++ implementation supports them, and I can imagine wanting to round to, e.g., 90 minutes or 36 hours sometimes. Users would probably not mind, since allowing these values would not reduce any capabilities. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org