This is an automated email from the ASF dual-hosted git repository.
amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 10bb61804d ARROW-16407: [R] Extend `parse_date_time` to cover hour,
dates, and minutes components (#13196)
10bb61804d is described below
commit 10bb61804d0eccd9715d4aa372018443236899b5
Author: Dragoș Moldovan-Grünfeld <[email protected]>
AuthorDate: Wed Jun 29 15:08:31 2022 +0100
ARROW-16407: [R] Extend `parse_date_time` to cover hour, dates, and minutes
components (#13196)
This PR improves `parse_date_time()` by:
* adding support for orders with the hours, minutes, and seconds components
* adding support for unseparated strings
([ARROW-16446](https://issues.apache.org/jira/browse/ARROW-16446))
* supporting the `exact` argument:
* allows users to pass `exact = TRUE` in which case the `orders` are
taken as they are (they are considered `formats` and passed to `strptime`)
* `exact = FALSE` implies `formats` are derived from `orders`
* allowing the `truncated` argument
* denotes number of formats that might be missing. For example, passing
an `order` like `ymd_HMS` and a value of 1 for `truncated` will attempt parsing
with both `ymd_HMS` and `ymd_HM` orders
* erroring when the user passes `quiet = FALSE`
* improves the utility function used to generate `formats` (which are then
passed on to `strptime`) from `orders`
* less hard-coding and increased ability to deal with different orders
and separators
the `ymd HMS` orders (and variants) will parse correctly:
``` r
library(dplyr, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
library(arrow, warn.conflicts = FALSE)
test_df <- tibble(
x = c("2011-12-31 12:59:59", "2010-01-01 12:11", "2010-01-01 12",
"2010-01-01")
)
test_df %>%
mutate(
y = parse_date_time(x, "Ymd HMS", truncated = 3)
)
#> # A tibble: 4 × 2
#> x y
#> <chr> <dttm>
#> 1 2011-12-31 12:59:59 2011-12-31 12:59:59
#> 2 2010-01-01 12:11 2010-01-01 12:11:00
#> 3 2010-01-01 12 2010-01-01 12:00:00
#> 4 2010-01-01 2010-01-01 00:00:00
test_df %>%
arrow_table() %>%
mutate(
y = parse_date_time(x, "Ymd HMS", truncated = 3)
) %>%
collect()
#> # A tibble: 4 × 2
#> x y
#> <chr> <dttm>
#> 1 2011-12-31 12:59:59 2011-12-31 12:59:59
#> 2 2010-01-01 12:11 2010-01-01 12:11:00
#> 3 2010-01-01 12 2010-01-01 12:00:00
#> 4 2010-01-01 2010-01-01 00:00:00
```
<sup>Created on 2022-05-19 by the [reprex
package](https://reprex.tidyverse.org) (v2.0.1)</sup>
`exact = TRUE` can also be used:
<details>
``` r
library(arrow, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
test_df <- tibble(
x = c("11/23/1998 07:00:00", "6/18/1952 0135", "2/25/1974 0523",
"9/07/1985 01", NA)
)
test_df %>%
mutate(
parsed_x =
parse_date_time(
x,
c("%m/%d/%Y %I:%M:%S", "%m/%d/%Y %H%M", "%m/%d/%Y %H"),
exact = TRUE
)
)
#> # A tibble: 5 × 2
#> x parsed_x
#> <chr> <dttm>
#> 1 11/23/1998 07:00:00 1998-11-23 07:00:00
#> 2 6/18/1952 0135 1952-06-18 01:35:00
#> 3 2/25/1974 0523 1974-02-25 05:23:00
#> 4 9/07/1985 01 1985-09-07 01:00:00
#> 5 <NA> NA
test_df %>%
arrow_table() %>%
mutate(
parsed_x =
parse_date_time(
x,
c("%m/%d/%Y %I:%M:%S", "%m/%d/%Y %H%M", "%m/%d/%Y %H"),
exact = TRUE
)
) %>%
collect()
#> # A tibble: 5 × 2
#> x parsed_x
#> <chr> <dttm>
#> 1 11/23/1998 07:00:00 1998-11-23 07:00:00
#> 2 6/18/1952 0135 1952-06-18 01:35:00
#> 3 2/25/1974 0523 1974-02-25 05:23:00
#> 4 9/07/1985 01 1985-09-07 01:00:00
#> 5 <NA> NA
```
<sup>Created on 2022-05-20 by the [reprex
package](https://reprex.tidyverse.org) (v2.0.1)</sup>
</details>
Authored-by: Dragoș Moldovan-Grünfeld <[email protected]>
Signed-off-by: Alessandro Molina <[email protected]>
---
r/NEWS.md | 3 +-
r/R/dplyr-datetime-helpers.R | 243 ++++++++++++++--
r/R/dplyr-funcs-datetime.R | 121 ++------
r/tests/testthat/test-dplyr-funcs-datetime.R | 411 ++++++++++++++++++++++++---
4 files changed, 626 insertions(+), 152 deletions(-)
diff --git a/r/NEWS.md b/r/NEWS.md
index 6d25aa2154..d88be22964 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -20,8 +20,7 @@
# arrow 8.0.0.9000
* `lubridate::parse_date_time()` datetime parser:
- * currently parses only `orders` with year, month, and day components. In a
future release `orders` support for other datetime components (such as hours,
minutes, seconds, etc) will be added.
- * strings with no separators (e.g. `"20210917"`) could be ambiguous and are
not yet supported.
+ * `orders` with year, month, day, hours, minutes, and seconds components are
supported.
* the `orders` argument in the Arrow binding works as follows: `orders` are
transformed into `formats` which subsequently get applied in turn. There is no
`select_formats` parameter and no inference takes place (like is the case in
`lubridate::parse_date_time()`).
# arrow 8.0.0
diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index bc1a13075d..60771c8624 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -152,11 +152,24 @@ binding_as_date_numeric <- function(x, origin =
"1970-01-01") {
x
}
+#' Build formats from multiple orders
+#'
+#' This function is a vectorised version of `build_format_from_order()`. In
+#' addition to `build_format_from_order()`, it also checks if the supplied
+#' orders are currently supported.
+#'
+#' @inheritParams process_data_for_parsing
+#'
+#' @return a vector of unique formats
+#'
+#' @noRd
build_formats <- function(orders) {
# only keep the letters and the underscore as separator -> allow the users to
- # pass strptime-like formats (with "%"). Processing is needed (instead of
passing
+ # pass strptime-like formats (with "%"). We process the data -> we need to
+ # process the `orders` (even if supplied in the desired format)
+ # Processing is needed (instead of passing
# formats as-is) due to the processing of the character vector in
parse_date_time()
- orders <- gsub("[^A-Za-z_]", "", orders)
+ orders <- gsub("[^A-Za-z]", "", orders)
orders <- gsub("Y", "y", orders)
# we separate "ym', "my", and "yq" from the rest of the `orders` vector and
@@ -170,7 +183,7 @@ build_formats <- function(orders) {
orders1 <- setdiff(orders, short_orders)
orders2 <- intersect(orders, short_orders)
orders2 <- paste0(orders2, "d")
- orders <- unique(c(orders1, orders2))
+ orders <- unique(c(orders2, orders1))
}
if (any(orders == "yq")) {
@@ -179,7 +192,30 @@ build_formats <- function(orders) {
orders <- unique(c(orders1, orders2))
}
- supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+ if (any(orders == "qy")) {
+ orders1 <- setdiff(orders, "qy")
+ orders2 <- "ymd"
+ orders <- unique(c(orders1, orders2))
+ }
+
+ ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+ ymd_hms_orders <- c(
+ "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS",
+ "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H"
+ )
+ # support "%I" hour formats
+ ymd_ims_orders <- gsub("H", "I", ymd_hms_orders)
+
+ supported_orders <- c(
+ ymd_orders,
+ ymd_hms_orders,
+ gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as order separators
+ gsub("_", "", ymd_hms_orders),
+ ymd_ims_orders,
+ gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as order separators
+ gsub("_", "", ymd_ims_orders)
+ )
+
unsupported_passed_orders <- setdiff(orders, supported_orders)
supported_passed_orders <- intersect(orders, supported_orders)
@@ -200,20 +236,191 @@ build_formats <- function(orders) {
unique(formats)
}
+#' Build formats from a single order
+#'
+#' @param order a single string date-time format, such as `"ymd"` or
`"ymd_hms"`
+#'
+#' @return a vector of all possible formats derived from the input
+#' order
+#'
+#' @noRd
build_format_from_order <- function(order) {
- year_chars <- c("%y", "%Y")
- month_chars <- c("%m", "%B", "%b")
- day_chars <- "%d"
-
- outcome <- switch(
- order,
- "ymd" = expand.grid(year_chars, month_chars, day_chars),
- "ydm" = expand.grid(year_chars, day_chars, month_chars),
- "mdy" = expand.grid(month_chars, day_chars, year_chars),
- "myd" = expand.grid(month_chars, year_chars, day_chars),
- "dmy" = expand.grid(day_chars, month_chars, year_chars),
- "dym" = expand.grid(day_chars, year_chars, month_chars)
+ char_list <- list(
+ "y" = c("%y", "%Y"),
+ "m" = c("%m", "%B", "%b"),
+ "d" = "%d",
+ "H" = "%H",
+ "M" = "%M",
+ "S" = "%S",
+ "I" = "%I"
+ )
+
+ split_order <- strsplit(order, split = "")[[1]]
+
+ outcome <- expand.grid(char_list[split_order])
+ # we combine formats with and without the "-" separator, we will later
+ # coalesce through all of them (benchmarking indicated this is a more
+ # computationally efficient approach rather than figuring out if a string has
+ # separators or not and applying only )
+ # during parsing if the string to be parsed does not contain a separator
+ formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+ formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+ c(formats_with_sep, formats_without_sep)
+}
+
+#' Process data in preparation for parsing
+#'
+#' `process_data_for_parsing()` takes a data column and a vector of `orders`
and
+#' prepares several versions of the input data:
+#' * `processed_x` is a version of `x` where all separators were replaced
with
+#' `"-"` and multiple separators were collapsed into a single one. This
element
+#' is only set to an empty list when the `orders` argument indicate we're only
+#' interested in parsing the augmented version of `x`.
+#' * each of the other 3 elements augment `x` in some way
+#' * `augmented_x_ym` - builds the `ym` and `my` formats by adding `"01"`
+#' (to indicate the first day of the month)
+#' * `augmented_x_yq` - transforms the `yq` format to `ymd`, by deriving the
+#' first month of the quarter and adding `"01"` to indicate the first day
+#' * `augmented_x_qy` - transforms the `qy` format to `ymd` in a similar
+#' manner to `"yq"`
+#'
+#' @param x an Expression corresponding to a character or numeric vector of
+#' dates to be parsed.
+#' @param orders a character vector of date-time formats.
+#'
+#' @return a list made up of 4 lists, each a different version of x:
+#' * `processed_x`
+#' * `augmented_x_ym`
+#' * `augmented_x_yq`
+#' * `augmented_x_qy`
+#' @noRd
+process_data_for_parsing <- function(x, orders) {
+
+ processed_x <- x$cast(string())
+
+ # make all separators (non-letters and non-numbers) into "-"
+ processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+ # collapse multiple separators into a single one
+ processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+ # we need to transform `x` when orders are `ym`, `my`, and `yq`
+ # for `ym` and `my` orders we add a day ("01")
+ # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627
+ augmented_x_ym <- NULL
+ if (any(orders %in% c("ym", "my", "Ym", "mY"))) {
+ # add day as "-01" if there is a "-" separator and as "01" if not
+ augmented_x_ym <- call_binding(
+ "if_else",
+ call_binding("grepl", "-", processed_x),
+ call_binding("paste0", processed_x, "-01"),
+ call_binding("paste0", processed_x, "01")
+ )
+ }
+
+ # for `yq` we need to transform the quarter into the start month (lubridate
+ # behaviour) and then add 01 to parse to the first day of the quarter
+ augmented_x_yq <- NULL
+ if (any(orders %in% c("yq", "Yq"))) {
+ # extract everything that comes after the `-` separator, i.e. the quarter
+ # (e.g. 4 from 2022-4)
+ quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+ # we should probably error if quarter is not in 1:4
+ # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+ # in 2002-4)
+ year_x <- call_binding("gsub", "-.*$", "", processed_x)
+ quarter_x <- quarter_x$cast(int32())
+ month_x <- (quarter_x - 1) * 3 + 1
+ augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+ }
+
+ # same as for `yq`, we need to derive the month from the quarter and add a
+ # "01" to give us the first day of the month
+ augmented_x_qy <- NULL
+ if (any(orders %in% c("qy", "qY"))) {
+ quarter_x <- call_binding("gsub", "-.*$", "", processed_x)
+ quarter_x <- quarter_x$cast(int32())
+ year_x <- call_binding("gsub", "^.*?-", "", processed_x)
+ # year might be missing the final 0s when extracted from a float, hence the
+ # need to pad
+ year_x <- call_binding("str_pad", year_x, width = 4, side = "right", pad =
"0")
+ month_x <- (quarter_x - 1) * 3 + 1
+ augmented_x_qy <- call_binding("paste0", year_x, "-", month_x, "-01")
+ }
+
+ list(
+ "augmented_x_ym" = augmented_x_ym,
+ "augmented_x_yq" = augmented_x_yq,
+ "augmented_x_qy" = augmented_x_qy,
+ "processed_x" = processed_x
+ )
+}
+
+
+#' Attempt parsing
+#'
+#' This function does several things:
+#' * builds all possible `formats` from the supplied `orders`
+#' * processes the data with `process_data_for_parsing()`
+#' * build a list of the possible `strptime` Expressions for the data &
formats
+#' combinations
+#'
+#' @inheritParams process_data_for_parsing
+#'
+#' @return a list of `strptime` Expressions we can use with `coalesce`
+#' @noRd
+attempt_parsing <- function(x, orders) {
+ # translate orders into possible formats
+ formats <- build_formats(orders)
+
+ # depending on the orders argument we need to do some processing to the input
+ # data. `process_data_for_parsing()` uses the passed `orders` and not the
+ # derived `formats`
+ processed_data <- process_data_for_parsing(x, orders)
+
+ # build a list of expressions for parsing each processed_data element and
+ # format combination
+ parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs,
formats)
+
+ # if all orders are in c("ym", "my", "yq", "qy") only attempt to parse the
+ # augmented version(s) of x
+ if (all(orders %in% c("ym", "Ym", "my", "mY", "yq", "Yq", "qy", "qY"))) {
+ parse_attempt_exprs_list$processed_x <- list()
+ }
+
+ # we need the output to be a list of expressions (currently it is a list of
+ # lists of expressions due to the shape of the processed data. we have one
list
+ # of expressions for each element of/ list in processed_data) -> we need to
+ # remove a level of hierarchy from the list
+ purrr::flatten(parse_attempt_exprs_list)
+}
+
+#' Build `strptime` expressions
+#'
+#' This function takes several `formats`, iterates over them and builds a
+#' `strptime` Expression for each of them. Given these Expressions are
evaluated
+#' row-wise we can leverage this behaviour and introduce a condition. If `x`
has
+#' a separator, use the `format` as is, if it doesn't have a separator, remove
+#' the `"-"` separator from the `format`.
+#'
+#' @param x an Expression corresponding to a character or numeric vector of
+#' dates to be parsed.
+#' @param formats a character vector of formats as returned by
+#' `build_format_from_order`
+#'
+#' @return a list of Expressions
+#' @noRd
+build_strptime_exprs <- function(x, formats) {
+ # returning an empty list helps when iterating
+ if (is.null(x)) {
+ return(list())
+ }
+
+ map(
+ formats,
+ ~ build_expr(
+ "strptime",
+ x,
+ options = list(format = .x, unit = 0L, error_is_null = TRUE)
+ )
)
- outcome$format <- paste(outcome$Var1, outcome$Var2, outcome$Var3, sep = "-")
- outcome$format
}
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index bc0ddc4eed..8ecb80b6b4 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -520,109 +520,38 @@ register_bindings_duration_helpers <- function() {
register_bindings_datetime_parsers <- function() {
register_binding("parse_date_time", function(x,
orders,
- tz = "UTC") {
-
- # each order is translated into possible formats
- formats <- build_formats(orders)
-
- x <- x$cast(string())
-
- # make all separators (non-letters and non-numbers) into "-"
- x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x)
- # collapse multiple separators into a single one
- x <- call_binding("gsub", "-{2,}", "-", x)
-
- # we need to transform `x` when orders are `ym`, `my`, and `yq`
- # for `ym` and `my` orders we add a day ("01")
- augmented_x <- NULL
- if (any(orders %in% c("ym", "my"))) {
- augmented_x <- call_binding("paste0", x, "-01")
- }
-
- # for `yq` we need to transform the quarter into the start month (lubridate
- # behaviour) and then add 01 to parse to the first day of the quarter
- augmented_x2 <- NULL
- if (any(orders == "yq")) {
- # extract everything that comes after the `-` separator, i.e. the quarter
- # (e.g. 4 from 2022-4)
- quarter_x <- call_binding("gsub", "^.*?-", "", x)
- # we should probably error if quarter is not in 1:4
- # extract everything that comes before the `-`, i.e. the year (e.g. 2002
- # in 2002-4)
- year_x <- call_binding("gsub", "-.*$", "", x)
- quarter_x <- quarter_x$cast(int32())
- month_x <- (quarter_x - 1) * 3 + 1
- augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01")
- }
-
- # TODO figure out how to parse strings that have no separators
- # https://issues.apache.org/jira/browse/ARROW-16446
- # we could insert separators at the "likely" positions, but it might be
- # tricky given the possible combinations between dmy formats + locale
-
- # build a list of expressions for each format
- parse_attempt_expressions <- map(
- formats,
- ~ build_expr(
- "strptime",
- x,
- options = list(
- format = .x,
- unit = 0L,
- error_is_null = TRUE
- )
- )
- )
+ tz = "UTC",
+ truncated = 0,
+ quiet = TRUE,
+ exact = FALSE) {
+ if (!quiet) {
+ arrow_not_supported("`quiet = FALSE`")
+ }
- # build separate expression lists of parsing attempts for the orders that
- # need an augmented `x`
- # list for attempts when orders %in% c("ym", "my")
- parse_attempt_exp_augmented_x <- list()
-
- if (!is.null(augmented_x)) {
- parse_attempt_exp_augmented_x <- map(
- formats,
- ~ build_expr(
- "strptime",
- augmented_x,
- options = list(
- format = .x,
- unit = 0L,
- error_is_null = TRUE
- )
- )
- )
+ if (truncated > 0) {
+ if (truncated > (nchar(orders) - 3)) {
+ arrow_not_supported(paste0("a value for `truncated` > ", nchar(orders)
- 3))
+ }
+ # build several orders for truncated formats
+ orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop =
nchar(orders) - .x))
}
- # list for attempts when orders %in% c("yq")
- parse_attempt_exp_augmented_x2 <- list()
- if (!is.null(augmented_x2)) {
- parse_attempt_exp_augmented_x2 <- map(
- formats,
- ~ build_expr(
- "strptime",
- augmented_x2,
- options = list(
- format = .x,
- unit = 0L,
- error_is_null = TRUE
- )
- )
- )
+ if (!inherits(x, "Expression")) {
+ x <- Expression$scalar(x)
}
- # combine all attempts expressions in prep for coalesce
- parse_attempt_expressions <- c(
- parse_attempt_expressions,
- parse_attempt_exp_augmented_x,
- parse_attempt_exp_augmented_x2
- )
+ if (exact == TRUE) {
+ # no data processing takes place & we don't derive formats
+ parse_attempts <- build_strptime_exprs(x, orders)
+ } else {
+ parse_attempts <- attempt_parsing(x, orders = orders)
+ }
- coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)
+ coalesce_output <- build_expr("coalesce", args = parse_attempts)
- # we need this binding to be able to handle a NULL `tz`, which will then be
- # used by bindings such as `ymd` to return, based on whether tz is NULL or
- # not, a date or timestamp
+ # we need this binding to be able to handle a NULL `tz`, which, in turn,
+ # will be used by bindings such as `ymd()` to return a date or timestamp,
+ # based on whether tz is NULL or not
if (!is.null(tz)) {
build_expr("assume_timezone", coalesce_output, options = list(timezone =
tz))
} else {
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R
b/r/tests/testthat/test-dplyr-funcs-datetime.R
index bcea41b052..15af0c9f8d 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -1678,22 +1678,17 @@ test_that("parse_date_time() works with year, month,
and date components", {
string_ymd = c(
"2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5",
"2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10",
"21:09:11",
- # not yet working for strings with no separators, like "20210917",
"210918" or "2021Sep19
- # no separators and %b or %B are even more complicated (and they work
in
- # lubridate). not to mention locale
- NA
+ "20210912", "210913", NA
),
string_dmy = c(
"1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021",
"6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21",
"11:09:21",
- # not yet working for strings with no separators, like "10092021",
"100921",
- NA
+ "12092021", "130921", NA
),
string_mdy = c(
"09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021",
"09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21",
- # not yet working for strings with no separators, like "09102021",
"091021",
- NA
+ "09122021", "091321", NA
)
)
)
@@ -1711,13 +1706,16 @@ test_that("parse_date_time() works with year, month,
and date components", {
collect(),
tibble::tibble(
string_ymd = c(
- "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", NA
+ "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15",
+ "2021Sep16", NA
),
string_dmy = c(
- "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", NA
+ "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21",
+ "16Sep2021", NA
),
string_mdy = c(
- "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", NA
+ "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21",
+ "Sep1621", NA
)
)
)
@@ -1744,20 +1742,6 @@ test_that("parse_date_time() works with a mix of formats
and orders", {
)
})
-test_that("parse_date_time() doesn't work with hour, minutes, and second
components", {
- test_dates_times <- tibble(
- date_times = c("09-01-17 12:34:56", NA)
- )
-
- expect_warning(
- test_dates_times %>%
- arrow_table() %>%
- mutate(parsed_date_ymd = parse_date_time(date_times, orders =
"ymd_HMS")) %>%
- collect(),
- '"ymd_HMS" `orders` not supported in Arrow'
- )
-})
-
test_that("year, month, day date/time parsers", {
test_df <- tibble::tibble(
ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"),
@@ -1802,11 +1786,16 @@ test_that("year, month, day date/time parsers", {
test_that("ym, my & yq parsers", {
test_df <- tibble::tibble(
- ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA),
+ ym_string = c("2022-05", "2022/02", "22.3", "1979//12", "88.09", NA),
my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA),
+ Ym_string = c("2022-05", "2022/02", "2022.03", "1979//12", "1988.09", NA),
+ mY_string = c("05-2022", "02/2022", "03.2022", "12//1979", "09.1988", NA),
yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA),
yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA),
- yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA)
+ yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA),
+ qy_string = c("3.2007", "2.1970", "1.2020", "4.2009", "1.1975", NA),
+ qy_numeric = c(3.2007, 2.1970, 1.2020, 4.2009, 1.1975, NA),
+ qy_space = c("3 2007", "2 1970", "1 2020", "4 2009", "1 1975", NA)
)
# these functions' internals use some string processing which requires the
@@ -1817,8 +1806,12 @@ test_that("ym, my & yq parsers", {
mutate(
ym_date = ym(ym_string),
ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"),
+ Ym_date = ym(Ym_string),
+ Ym_datetime = ym(Ym_string, tz = "Pacific/Marquesas"),
my_date = my(my_string),
my_datetime = my(my_string, tz = "Pacific/Marquesas"),
+ mY_date = my(mY_string),
+ mY_datetime = my(mY_string, tz = "Pacific/Marquesas"),
yq_date_from_string = yq(yq_string),
yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"),
yq_date_from_numeric = yq(yq_numeric),
@@ -1827,9 +1820,23 @@ test_that("ym, my & yq parsers", {
yq_datetime_from_string_with_space = yq(yq_space, tz =
"Pacific/Marquesas"),
ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")),
my_date2 = parse_date_time(my_string, orders = c("my", "myd")),
+ Ym_date2 = parse_date_time(Ym_string, orders = c("Ym", "ymd")),
+ mY_date2 = parse_date_time(mY_string, orders = c("mY", "myd")),
yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"),
yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"),
- yq_date_from_string_with_space2 = parse_date_time(yq_space, orders =
"yq")
+ yq_date_from_string_with_space2 = parse_date_time(yq_space, orders =
"yq"),
+ # testing with Yq
+ yq_date_from_string3 = parse_date_time(yq_string, orders = "Yq"),
+ yq_date_from_numeric3 = parse_date_time(yq_numeric, orders = "Yq"),
+ yq_date_from_string_with_space3 = parse_date_time(yq_space, orders =
"Yq"),
+ # testing with qy
+ qy_date_from_string = parse_date_time(qy_string, orders = "qy"),
+ qy_date_from_numeric = parse_date_time(qy_numeric, orders = "qy"),
+ qy_date_from_string_with_space = parse_date_time(qy_space, orders =
"qy"),
+ # testing with qY
+ qy_date_from_string2 = parse_date_time(qy_string, orders = "qY"),
+ qy_date_from_numeric2 = parse_date_time(qy_numeric, orders = "qY"),
+ qy_date_from_string_with_space2 = parse_date_time(qy_space, orders =
"qY")
) %>%
collect(),
test_df
@@ -1851,9 +1858,7 @@ test_that("lubridate's fast_strptime", {
collect(),
tibble(
x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA)
- )#,
- # arrow does not preserve the `tzone` attribute
- # test ignore_attr = TRUE
+ )
)
# R object
@@ -1870,8 +1875,7 @@ test_that("lubridate's fast_strptime", {
collect(),
tibble(
x = c("2018-10-07 19:04:05", NA)
- )#,
- # test ignore_attr = TRUE
+ )
)
compare_dplyr_binding(
@@ -1890,6 +1894,10 @@ test_that("lubridate's fast_strptime", {
)
)
+ # these functions' internals use some string processing which requires the
+ # RE2 library (not available on Windows with R 3.6)
+ skip_if_not_available("re2")
+
compare_dplyr_binding(
.input %>%
mutate(
@@ -1923,9 +1931,7 @@ test_that("lubridate's fast_strptime", {
tibble(
x =
c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
- )#,
- # arrow does not preserve the `tzone` attribute
- # test ignore_attr = TRUE
+ )
)
# the arrow binding errors for a value different from 68L for `cutoff_2000`
@@ -1965,3 +1971,336 @@ test_that("lubridate's fast_strptime", {
collect()
)
})
+
+test_that("parse_date_time with hours, minutes and seconds components", {
+ test_dates_times <- tibble(
+ ymd_hms_string =
+ c("67-01-09 12:34:56", "1970-05-22 20:13:59", "870822201359", NA),
+ ymd_hm_string =
+ c("67-01-09 12:34", "1970-05-22 20:13", "8708222013", NA),
+ ymd_h_string =
+ c("67-01-09 12", "1970-05-22 20", "87082220", NA),
+ dmy_hms_string =
+ c("09-01-67 12:34:56", "22-05-1970 20:13:59", "220887201359", NA),
+ dmy_hm_string =
+ c("09-01-67 12:34", "22-05-1970 20:13", "2208872013", NA),
+ dmy_h_string =
+ c("09-01-67 12", "22-05-1970 20", "22088720", NA),
+ mdy_hms_string =
+ c("01-09-67 12:34:56", "05-22-1970 20:13:59", "082287201359", NA),
+ mdy_hm_string =
+ c("01-09-67 12:34", "05-22-1970 20:13", "0822872013", NA),
+ mdy_h_string =
+ c("01-09-67 12", "05-22-1970 20", "08228720", NA),
+ ydm_hms_string =
+ c("67-09-01 12:34:56", "1970-22-05 20:13:59", "872208201359", NA),
+ ydm_hm_string =
+ c("67-09-01 12:34", "1970-22-05 20:13", "8722082013", NA),
+ ydm_h_string =
+ c("67-09-01 12", "1970-22-05 20", "87220820", NA)
+ )
+ # the unseparated strings are versions of "1987-08-22 20:13:59" (with %y)
+
+ # these functions' internals use some string processing which requires the
+ # RE2 library (not available on Windows with R 3.6)
+ skip_if_not_available("re2")
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"),
+ ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM"),
+ ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"),
+ dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"),
+ dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM"),
+ dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"),
+ mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"),
+ mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM"),
+ mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"),
+ ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"),
+ ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydmHM"),
+ ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydmH")
+ ) %>%
+ collect(),
+ test_dates_times
+ )
+
+ # parse_date_time with timezone
+ pm_tz <- "Pacific/Marquesas"
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS", tz
= pm_tz),
+ ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM", tz =
pm_tz),
+ ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H", tz =
pm_tz),
+ dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS", tz
= pm_tz),
+ dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM", tz =
pm_tz),
+ dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H", tz =
pm_tz),
+ mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS", tz
= pm_tz),
+ mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM", tz =
pm_tz),
+ mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H", tz =
pm_tz),
+ ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS", tz
= pm_tz),
+ ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM", tz =
pm_tz),
+ ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H", tz =
pm_tz)
+ ) %>%
+ collect(),
+ test_dates_times
+ )
+
+ # test ymd_ims
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_ims_dttm =
+ parse_date_time(
+ ymd_ims_string,
+ orders = "ymd_IMS",
+ # lubridate is chatty and will warn 1 format failed to parse
+ quiet = TRUE
+ )
+ ) %>%
+ collect(),
+ tibble(
+ ymd_ims_string =
+ c("67-01-09 9:34:56", "1970-05-22 10:13:59", "19870822171359", NA)
+ )
+ )
+})
+
+test_that("parse_date_time with month names and HMS", {
+ # locale (affecting "%b% and "%B" formats) does not work properly on Windows
+ # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done
+ skip_on_os("windows")
+ test_dates_times2 <- tibble(
+ ymd_hms_string =
+ c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", "87Aug22201359", NA),
+ ymd_hm_string =
+ c("67-Jan-09 12:34", "1970-June-22 20:13", "87Aug222013", NA),
+ ymd_h_string =
+ c("67-Jan-09 12", "1970-June-22 20", "87Aug2220", NA),
+ dmy_hms_string =
+ c("09-Jan-67 12:34:56", "22-June-1970 20:13:59", "22Aug87201359", NA),
+ dmy_hm_string =
+ c("09-Jan-67 12:34", "22-June-1970 20:13", "22Aug872013", NA),
+ dmy_h_string =
+ c("09-Jan-67 12", "22-June-1970 20", "22Aug8720", NA),
+ mdy_hms_string =
+ c("Jan-09-67 12:34:56", "June-22-1970 20:13:59", "Aug2287201359", NA),
+ mdy_hm_string =
+ c("Jan-09-67 12:34", "June-22-1970 20:13", "Aug22872013", NA),
+ mdy_h_string =
+ c("Jan-09-67 12", "June-22-1970 20", "Aug228720", NA),
+ ydm_hms_string =
+ c("67-09-Jan 12:34:56", "1970-22-June 20:13:59", "8722Aug201359", NA),
+ ydm_hm_string =
+ c("67-09-Jan 12:34", "1970-22-June 20:13", "8722Aug2013", NA),
+ ydm_h_string =
+ c("67-09-Jan 12", "1970-22-June 20", "8722Aug20", NA)
+ )
+ # the un-separated strings are versions of "1987-08-22 20:13:59" (with %y)
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"),
+ ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymdHM"),
+ ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"),
+ dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"),
+ dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmyHM"),
+ dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"),
+ mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"),
+ mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdyHM"),
+ mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"),
+ ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"),
+ ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydmHM"),
+ ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H")
+ ) %>%
+ collect(),
+ test_dates_times2
+ )
+})
+
+test_that("parse_date_time with `quiet = FALSE` not supported", {
+ # we need expect_warning twice as both the arrow pipeline (because quiet =
+ # FALSE is not supported) and the fallback dplyr/lubridate one throw
+ # warnings (the lubridate one because quiet is FALSE)
+ expect_warning(
+ expect_warning(
+ tibble(x = c("2022-05-19 13:46:51")) %>%
+ arrow_table() %>%
+ mutate(
+ x_dttm = parse_date_time(x, orders = "dmy_HMS", quiet = FALSE)
+ ) %>%
+ collect(),
+ "`quiet = FALSE` not supported in Arrow"
+ ),
+ "All formats failed to parse"
+ )
+})
+
+test_that("parse_date_time with truncated formats", {
+ # these functions' internals use some string processing which requires the
+ # RE2 library (not available on Windows with R 3.6)
+ skip_if_not_available("re2")
+
+ test_truncation_df <- tibble(
+ truncated_ymd_string =
+ c(
+ "2022-05-19 13:46:51",
+ "2022-05-18 13:46",
+ "2022-05-17 13",
+ "2022-05-16"
+ )
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ dttm =
+ parse_date_time(
+ truncated_ymd_string,
+ orders = "ymd_HMS",
+ truncated = 3
+ )
+ ) %>%
+ collect(),
+ test_truncation_df
+ )
+
+ # values for truncated greater than nchar(orders) - 3 not supported in Arrow
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ dttm =
+ parse_date_time(
+ truncated_ymd_string,
+ orders = "ymd_HMS",
+ truncated = 5
+ )
+ ) %>%
+ collect(),
+ test_truncation_df,
+ warning = "a value for `truncated` > 4 not supported in Arrow"
+ )
+})
+
+test_that("parse_date_time with `exact = TRUE`, and with regular R objects", {
+ test_df <- tibble(
+ x = c("2022-12-31 12:59:59", "2022-01-01 12:11", "2022-01-01 12",
"2022-01-01", NA),
+ y = c("11/23/1998 07:00:00", "6/18/1952 0135", "2/25/1974 0523",
"9/07/1985 01", NA)
+ )
+
+ # these functions' internals use some string processing which requires the
+ # RE2 library (not available on Windows with R 3.6)
+ skip_if_not_available("re2")
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ parsed_x =
+ parse_date_time(
+ x,
+ c("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%d %H",
"%Y-%m-%d"),
+ exact = TRUE
+ ),
+ parsed_y =
+ parse_date_time(
+ y,
+ c("%m/%d/%Y %I:%M:%S", "%m/%d/%Y %H%M", "%m/%d/%Y %H"),
+ exact = TRUE
+ )
+ ) %>%
+ collect(),
+ test_df
+ )
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ b = parse_date_time("2022-12-31 12:59:59", orders = "ymd_HMS")
+ ) %>%
+ collect(),
+ tibble(
+ a = 1
+ )
+ )
+})
+
+test_that("build_formats() and build_format_from_order()", {
+ expect_equal(
+ build_formats(c("ym", "myd", "%Y-%d-%m")),
+ c(
+ # formats from "ym" order
+ "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
+ "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d",
+ # formats from "myd" order
+ "%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", "%b-%Y-%d",
+ "%m%y%d", "%B%y%d", "%b%y%d", "%m%Y%d", "%B%Y%d", "%b%Y%d",
+ # formats from "%Y-%d-%m" format
+ "%y-%d-%m", "%Y-%d-%m", "%y-%d-%B", "%Y-%d-%B", "%y-%d-%b", "%Y-%d-%b",
+ "%y%d%m", "%Y%d%m", "%y%d%B", "%Y%d%B", "%y%d%b", "%Y%d%b")
+ )
+
+ expect_equal(
+ build_formats("ymd_HMS"),
+ c("%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S",
+ "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
+ "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S",
+ "%y%b%d%H%M%S", "%Y%b%d%H%M%S")
+ )
+
+ # when order is one of "yq", "qy", "ym" or"my" the data is augmented to "ymd"
+ # or "ydm" and the formats are built accordingly
+ ymd_formats <- c(
+ "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
+ "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d")
+ expect_equal(
+ build_formats("yq"),
+ ymd_formats
+ )
+
+ expect_equal(
+ build_formats("ym"),
+ ymd_formats
+ )
+
+ expect_equal(
+ build_formats("qy"),
+ ymd_formats
+ )
+
+ # build formats will output unique formats
+ expect_equal(
+ build_formats(c("yq", "ym", "qy")),
+ ymd_formats
+ )
+
+ expect_equal(
+ build_formats("my"),
+ c("%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", "%b-%Y-%d",
+ "%m%y%d", "%B%y%d", "%b%y%d", "%m%Y%d", "%B%Y%d", "%b%Y%d")
+ )
+
+ # ab not supported yet
+ expect_error(
+ build_formats("abd"),
+ '"abd" `orders` not supported in Arrow'
+ )
+
+ expect_error(
+ build_formats("vup"),
+ '"vup" `orders` not supported in Arrow'
+ )
+
+ expect_equal(
+ build_format_from_order("ymd"),
+ c("%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
+ "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d")
+ )
+
+ expect_equal(
+ build_format_from_order("ymdHMS"),
+ c("%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S",
+ "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
+ "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S",
+ "%y%b%d%H%M%S", "%Y%b%d%H%M%S")
+ )
+})