This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 60f6caf9b1 ARROW-16516: [R] Implement ym() my() and yq() parsers
60f6caf9b1 is described below
commit 60f6caf9b19d145757baac553506d150720728b2
Author: Dragoș Moldovan-Grünfeld <[email protected]>
AuthorDate: Wed May 18 10:44:53 2022 +0100
ARROW-16516: [R] Implement ym() my() and yq() parsers
The `ym()`, `my()` and `yq()` bindings will make the following possible
(and identical):
``` r
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
test_df <- tibble::tibble(
ym_string = c("2022-05", "2022/02", "22.03", NA)
)
test_df %>%
mutate(ym_date = ym(ym_string))
#> # A tibble: 4 × 2
#> ym_string ym_date
#> <chr> <date>
#> 1 2022-05 2022-05-01
#> 2 2022/02 2022-02-01
#> 3 22.03 2022-03-01
#> 4 <NA> NA
test_df %>%
arrow_table() %>%
mutate(ym_date = ym(ym_string)) %>%
collect()
#> # A tibble: 4 × 2
#> ym_string ym_date
#> <chr> <date>
#> 1 2022-05 2022-05-01
#> 2 2022/02 2022-02-01
#> 3 22.03 2022-03-01
#> 4 <NA> NA
```
<sup>Created on 2022-05-16 by the [reprex
package](https://reprex.tidyverse.org) (v2.0.1)</sup>
I've implementing this with the following steps:
* add `"-01"` to the end of the strings we're trying to parse, and then
* use one the supported `orders` (`"ymd"` or `"myd"`)
Closes #13163 from dragosmg/ym_my_yq_parsers
Authored-by: Dragoș Moldovan-Grünfeld <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/dplyr-datetime-helpers.R | 23 +++++++-
r/R/dplyr-funcs-datetime.R | 84 ++++++++++++++++++++++++++--
r/tests/testthat/test-dplyr-funcs-datetime.R | 38 ++++++++++++-
3 files changed, 137 insertions(+), 8 deletions(-)
diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index 1855a4a46e..607104d7ce 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -159,6 +159,26 @@ build_formats <- function(orders) {
orders <- gsub("[^A-Za-z_]", "", orders)
orders <- gsub("Y", "y", orders)
+ # we separate "ym', "my", and "yq" from the rest of the `orders` vector and
+ # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd`
+ # this is needed for 2 reasons:
+ # 1. strptime does not parse "2022-05" -> we add "-01", thus changing the
format,
+ # 2. for equivalence to lubridate, which parses `ym` to the first day of the
month
+ short_orders <- c("ym", "my")
+
+ if (any(orders %in% short_orders)) {
+ orders1 <- setdiff(orders, short_orders)
+ orders2 <- intersect(orders, short_orders)
+ orders2 <- paste0(orders2, "d")
+ orders <- unique(c(orders1, orders2))
+ }
+
+ if (any(orders == "yq")) {
+ orders1 <- setdiff(orders, "yq")
+ orders2 <- "ymd"
+ orders <- unique(c(orders1, orders2))
+ }
+
supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
unsupported_passed_orders <- setdiff(orders, supported_orders)
supported_passed_orders <- intersect(orders, supported_orders)
@@ -176,7 +196,8 @@ build_formats <- function(orders) {
}
formats_list <- map(orders, build_format_from_order)
- purrr::flatten_chr(formats_list)
+ formats <- purrr::flatten_chr(formats_list)
+ unique(formats)
}
build_format_from_order <- function(order) {
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index e0c65d64cc..02ec35bda2 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -493,27 +493,99 @@ register_bindings_datetime_parsers <- function() {
# each order is translated into possible formats
formats <- build_formats(orders)
+ x <- x$cast(string())
+
# make all separators (non-letters and non-numbers) into "-"
x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x)
# collapse multiple separators into a single one
x <- call_binding("gsub", "-{2,}", "-", x)
+ # we need to transform `x` when orders are `ym`, `my`, and `yq`
+ # for `ym` and `my` orders we add a day ("01")
+ augmented_x <- NULL
+ if (any(orders %in% c("ym", "my"))) {
+ augmented_x <- call_binding("paste0", x, "-01")
+ }
+
+ # for `yq` we need to transform the quarter into the start month (lubridate
+ # behaviour) and then add 01 to parse to the first day of the quarter
+ augmented_x2 <- NULL
+ if (any(orders == "yq")) {
+ # extract everything that comes after the `-` separator, i.e. the quarter
+ # (e.g. 4 from 2022-4)
+ quarter_x <- call_binding("gsub", "^.*?-", "", x)
+ # we should probably error if quarter is not in 1:4
+ # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+ # in 2002-4)
+ year_x <- call_binding("gsub", "-.*$", "", x)
+ quarter_x <- quarter_x$cast(int32())
+ month_x <- (quarter_x - 1) * 3 + 1
+ augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01")
+ }
+
# TODO figure out how to parse strings that have no separators
# https://issues.apache.org/jira/browse/ARROW-16446
# we could insert separators at the "likely" positions, but it might be
# tricky given the possible combinations between dmy formats + locale
# build a list of expressions for each format
- parse_attempt_expressions <- list()
-
- for (i in seq_along(formats)) {
- parse_attempt_expressions[[i]] <- build_expr(
+ parse_attempt_expressions <- map(
+ formats,
+ ~ build_expr(
"strptime",
x,
- options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE)
+ options = list(
+ format = .x,
+ unit = 0L,
+ error_is_null = TRUE
+ )
+ )
+ )
+
+ # build separate expression lists of parsing attempts for the orders that
+ # need an augmented `x`
+ # list for attempts when orders %in% c("ym", "my")
+ parse_attempt_exp_augmented_x <- list()
+
+ if (!is.null(augmented_x)) {
+ parse_attempt_exp_augmented_x <- map(
+ formats,
+ ~ build_expr(
+ "strptime",
+ augmented_x,
+ options = list(
+ format = .x,
+ unit = 0L,
+ error_is_null = TRUE
+ )
+ )
)
}
+ # list for attempts when orders %in% c("yq")
+ parse_attempt_exp_augmented_x2 <- list()
+ if (!is.null(augmented_x2)) {
+ parse_attempt_exp_augmented_x2 <- map(
+ formats,
+ ~ build_expr(
+ "strptime",
+ augmented_x2,
+ options = list(
+ format = .x,
+ unit = 0L,
+ error_is_null = TRUE
+ )
+ )
+ )
+ }
+
+ # combine all attempts expressions in prep for coalesce
+ parse_attempt_expressions <- c(
+ parse_attempt_expressions,
+ parse_attempt_exp_augmented_x,
+ parse_attempt_exp_augmented_x2
+ )
+
coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)
# we need this binding to be able to handle a NULL `tz`, which will then be
@@ -527,7 +599,7 @@ register_bindings_datetime_parsers <- function() {
})
- ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+ ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my",
"yq")
ymd_parser_map_factory <- function(order) {
force(order)
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R
b/r/tests/testthat/test-dplyr-funcs-datetime.R
index 42448e8243..b122363015 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -1735,7 +1735,7 @@ test_that("parse_date_time() doesn't work with hour,
minutes, and second compone
)
})
-test_that("year, month, day date/time parsers work", {
+test_that("year, month, day date/time parsers", {
test_df <- tibble::tibble(
ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"),
ydm_string = c("2022-11-05", "2022/12/05", "22.13-05"),
@@ -1776,3 +1776,39 @@ test_that("year, month, day date/time parsers work", {
test_df
)
})
+
+test_that("ym, my & yq parsers", {
+ test_df <- tibble::tibble(
+ ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA),
+ my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA),
+ yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA),
+ yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA),
+ yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA)
+ )
+
+ # these functions' internals use some string processing which requires the
+ # RE2 library (not available on Windows with R 3.6)
+ skip_if_not_available("re2")
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ym_date = ym(ym_string),
+ ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"),
+ my_date = my(my_string),
+ my_datetime = my(my_string, tz = "Pacific/Marquesas"),
+ yq_date_from_string = yq(yq_string),
+ yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"),
+ yq_date_from_numeric = yq(yq_numeric),
+ yq_datetime_from_numeric = yq(yq_numeric, tz = "Pacific/Marquesas"),
+ yq_date_from_string_with_space = yq(yq_space),
+ yq_datetime_from_string_with_space = yq(yq_space, tz =
"Pacific/Marquesas"),
+ ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")),
+ my_date2 = parse_date_time(my_string, orders = c("my", "myd")),
+ yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"),
+ yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"),
+ yq_date_from_string_with_space2 = parse_date_time(yq_space, orders =
"yq")
+ ) %>%
+ collect(),
+ test_df
+ )
+})