[arrow] branch master updated: ARROW-16516: [R] Implement ym() my() and yq() parsers

thisisnic Wed, 18 May 2022 02:45:16 -0700

This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 60f6caf9b1 ARROW-16516: [R] Implement ym() my() and yq() parsers
60f6caf9b1 is described below

commit 60f6caf9b19d145757baac553506d150720728b2
Author: Dragoș Moldovan-Grünfeld <[email protected]>
AuthorDate: Wed May 18 10:44:53 2022 +0100

    ARROW-16516: [R] Implement ym() my() and yq() parsers
    
    The `ym()`, `my()` and `yq()` bindings will make the following possible 
(and identical):
    
    ``` r
    library(arrow, warn.conflicts = FALSE)
    library(dplyr, warn.conflicts = FALSE)
    library(lubridate, warn.conflicts = FALSE)
    
    test_df <- tibble::tibble(
      ym_string = c("2022-05", "2022/02", "22.03", NA)
    )
    
    test_df %>%
      mutate(ym_date = ym(ym_string))
    #> # A tibble: 4 × 2
    #>   ym_string ym_date
    #>   <chr>     <date>
    #> 1 2022-05   2022-05-01
    #> 2 2022/02   2022-02-01
    #> 3 22.03     2022-03-01
    #> 4 <NA>      NA
    
    test_df %>%
      arrow_table() %>%
      mutate(ym_date = ym(ym_string)) %>%
      collect()
    #> # A tibble: 4 × 2
    #>   ym_string ym_date
    #>   <chr>     <date>
    #> 1 2022-05   2022-05-01
    #> 2 2022/02   2022-02-01
    #> 3 22.03     2022-03-01
    #> 4 <NA>      NA
    ```
    
    <sup>Created on 2022-05-16 by the [reprex 
package](https://reprex.tidyverse.org) (v2.0.1)</sup>
    
    I've implementing this with the following steps:
    * add `"-01"` to the end of the strings we're trying to parse, and then
    * use one the supported `orders` (`"ymd"` or `"myd"`)
    
    Closes #13163 from dragosmg/ym_my_yq_parsers
    
    Authored-by: Dragoș Moldovan-Grünfeld <[email protected]>
    Signed-off-by: Nic Crane <[email protected]>
---
 r/R/dplyr-datetime-helpers.R                 | 23 +++++++-
 r/R/dplyr-funcs-datetime.R                   | 84 ++++++++++++++++++++++++++--
 r/tests/testthat/test-dplyr-funcs-datetime.R | 38 ++++++++++++-
 3 files changed, 137 insertions(+), 8 deletions(-)

diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index 1855a4a46e..607104d7ce 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -159,6 +159,26 @@ build_formats <- function(orders) {
   orders <- gsub("[^A-Za-z_]", "", orders)
   orders <- gsub("Y", "y", orders)
 
+  # we separate "ym', "my", and "yq" from the rest of the `orders` vector and
+  # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd`
+  # this is needed for 2 reasons:
+  # 1. strptime does not parse "2022-05" -> we add "-01", thus changing the 
format,
+  # 2. for equivalence to lubridate, which parses `ym` to the first day of the 
month
+  short_orders <- c("ym", "my")
+
+  if (any(orders %in% short_orders)) {
+    orders1 <- setdiff(orders, short_orders)
+    orders2 <- intersect(orders, short_orders)
+    orders2 <- paste0(orders2, "d")
+    orders <- unique(c(orders1, orders2))
+  }
+
+  if (any(orders == "yq")) {
+    orders1 <- setdiff(orders, "yq")
+    orders2 <- "ymd"
+    orders <- unique(c(orders1, orders2))
+  }
+
   supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
   unsupported_passed_orders <- setdiff(orders, supported_orders)
   supported_passed_orders <- intersect(orders, supported_orders)
@@ -176,7 +196,8 @@ build_formats <- function(orders) {
   }
 
   formats_list <- map(orders, build_format_from_order)
-  purrr::flatten_chr(formats_list)
+  formats <- purrr::flatten_chr(formats_list)
+  unique(formats)
 }
 
 build_format_from_order <- function(order) {
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index e0c65d64cc..02ec35bda2 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -493,27 +493,99 @@ register_bindings_datetime_parsers <- function() {
     # each order is translated into possible formats
     formats <- build_formats(orders)
 
+    x <- x$cast(string())
+
     # make all separators (non-letters and non-numbers) into "-"
     x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x)
     # collapse multiple separators into a single one
     x <- call_binding("gsub", "-{2,}", "-", x)
 
+    # we need to transform `x` when orders are `ym`, `my`, and `yq`
+    # for `ym` and `my` orders we add a day ("01")
+    augmented_x <- NULL
+    if (any(orders %in% c("ym", "my"))) {
+      augmented_x <- call_binding("paste0", x, "-01")
+    }
+
+    # for `yq` we need to transform the quarter into the start month (lubridate
+    # behaviour) and then add 01 to parse to the first day of the quarter
+    augmented_x2 <- NULL
+    if (any(orders == "yq")) {
+      # extract everything that comes after the `-` separator, i.e. the quarter
+      # (e.g. 4 from 2022-4)
+      quarter_x <- call_binding("gsub", "^.*?-", "", x)
+      # we should probably error if quarter is not in 1:4
+      # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+      # in 2002-4)
+      year_x <- call_binding("gsub", "-.*$", "", x)
+      quarter_x <- quarter_x$cast(int32())
+      month_x <- (quarter_x - 1) * 3 + 1
+      augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01")
+    }
+
     # TODO figure out how to parse strings that have no separators
     # https://issues.apache.org/jira/browse/ARROW-16446
     # we could insert separators at the "likely" positions, but it might be
     # tricky given the possible combinations between dmy formats + locale
 
     # build a list of expressions for each format
-    parse_attempt_expressions <- list()
-
-    for (i in seq_along(formats)) {
-      parse_attempt_expressions[[i]] <- build_expr(
+    parse_attempt_expressions <- map(
+      formats,
+      ~ build_expr(
         "strptime",
         x,
-        options = list(format = formats[[i]], unit = 0L, error_is_null = TRUE)
+        options = list(
+          format = .x,
+          unit = 0L,
+          error_is_null = TRUE
+        )
+      )
+    )
+
+    # build separate expression lists of parsing attempts for the orders that
+    # need an augmented `x`
+    # list for attempts when orders %in% c("ym", "my")
+    parse_attempt_exp_augmented_x <- list()
+
+    if (!is.null(augmented_x)) {
+      parse_attempt_exp_augmented_x <- map(
+        formats,
+        ~ build_expr(
+          "strptime",
+          augmented_x,
+          options = list(
+            format = .x,
+            unit = 0L,
+            error_is_null = TRUE
+          )
+        )
       )
     }
 
+    # list for attempts when orders %in% c("yq")
+    parse_attempt_exp_augmented_x2 <- list()
+    if (!is.null(augmented_x2)) {
+      parse_attempt_exp_augmented_x2 <- map(
+        formats,
+        ~ build_expr(
+          "strptime",
+          augmented_x2,
+          options = list(
+            format = .x,
+            unit = 0L,
+            error_is_null = TRUE
+          )
+        )
+      )
+    }
+
+    # combine all attempts expressions in prep for coalesce
+    parse_attempt_expressions <- c(
+      parse_attempt_expressions,
+      parse_attempt_exp_augmented_x,
+      parse_attempt_exp_augmented_x2
+    )
+
     coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)
 
     # we need this binding to be able to handle a NULL `tz`, which will then be
@@ -527,7 +599,7 @@ register_bindings_datetime_parsers <- function() {
 
   })
 
-  ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+  ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my", 
"yq")
 
   ymd_parser_map_factory <- function(order) {
     force(order)
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R 
b/r/tests/testthat/test-dplyr-funcs-datetime.R
index 42448e8243..b122363015 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -1735,7 +1735,7 @@ test_that("parse_date_time() doesn't work with hour, 
minutes, and second compone
   )
 })
 
-test_that("year, month, day date/time parsers work", {
+test_that("year, month, day date/time parsers", {
   test_df <- tibble::tibble(
     ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"),
     ydm_string = c("2022-11-05", "2022/12/05", "22.13-05"),
@@ -1776,3 +1776,39 @@ test_that("year, month, day date/time parsers work", {
     test_df
   )
 })
+
+test_that("ym, my & yq parsers", {
+  test_df <- tibble::tibble(
+    ym_string = c("2022-05", "2022/02", "22.03", "1979//12", "88.09", NA),
+    my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA),
+    yq_string = c("2007.3", "1970.2", "2020.1", "2009.4", "1975.1", NA),
+    yq_numeric = c(2007.3, 1970.2, 2020.1, 2009.4, 1975.1, NA),
+    yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA)
+  )
+
+  # these functions' internals use some string processing which requires the
+  # RE2 library (not available on Windows with R 3.6)
+  skip_if_not_available("re2")
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        ym_date = ym(ym_string),
+        ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"),
+        my_date = my(my_string),
+        my_datetime = my(my_string, tz = "Pacific/Marquesas"),
+        yq_date_from_string = yq(yq_string),
+        yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"),
+        yq_date_from_numeric = yq(yq_numeric),
+        yq_datetime_from_numeric = yq(yq_numeric, tz = "Pacific/Marquesas"),
+        yq_date_from_string_with_space = yq(yq_space),
+        yq_datetime_from_string_with_space = yq(yq_space, tz = 
"Pacific/Marquesas"),
+        ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")),
+        my_date2 = parse_date_time(my_string, orders = c("my", "myd")),
+        yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"),
+        yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"),
+        yq_date_from_string_with_space2 = parse_date_time(yq_space, orders = 
"yq")
+      ) %>%
+      collect(),
+    test_df
+  )
+})

[arrow] branch master updated: ARROW-16516: [R] Implement ym() my() and yq() parsers

Reply via email to