dragosmg commented on code in PR #13196:
URL: https://github.com/apache/arrow/pull/13196#discussion_r905009795


##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -200,20 +236,193 @@ build_formats <- function(orders) {
   unique(formats)
 }
 
+#' Build formats from a single order
+#'
+#' @param order a single string date-time format, such as `"ymd"` or 
`"ymd_hms"`
+#'
+#' @return a vector of all possible formats derived from the input
+#' order
+#'
+#' @noRd
 build_format_from_order <- function(order) {
-  year_chars <- c("%y", "%Y")
-  month_chars <- c("%m", "%B", "%b")
-  day_chars <- "%d"
-
-  outcome <- switch(
-    order,
-    "ymd" = expand.grid(year_chars, month_chars, day_chars),
-    "ydm" = expand.grid(year_chars, day_chars, month_chars),
-    "mdy" = expand.grid(month_chars, day_chars, year_chars),
-    "myd" = expand.grid(month_chars, year_chars, day_chars),
-    "dmy" = expand.grid(day_chars, month_chars, year_chars),
-    "dym" = expand.grid(day_chars, year_chars, month_chars)
+  char_list <- list(
+    "y" = c("%y", "%Y"),
+    "m" = c("%m", "%B", "%b"),
+    "d" = "%d",
+    "H" = "%H",
+    "M" = "%M",
+    "S" = "%S",
+    "I" = "%I"
+  )
+
+  split_order <- strsplit(order, split = "")[[1]]
+
+  outcome <- expand.grid(char_list[split_order])
+  # we combine formats with and without the "-" separator, we will later
+  # coalesce through all of them (benchmarking indicated this is a more
+  # computationally efficient approach rather than figuring out if a string has
+  # separators or not and applying only )
+  # during parsing if the string to be parsed does not contain a separator
+  formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+  formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+  c(formats_with_sep, formats_without_sep)
+}
+
+#' Process data in preparation for parsing
+#'
+#' `process_data_for_parsing()` takes a data column and a vector of `orders` 
and
+#' prepares several versions of the input data:
+#'   * `processed_x` is a version of `x` where all separators were replaced 
with
+#'  `"-"` and multiple separators were collapsed into a single one. This 
element
+#'  is only set to an empty list when the `orders` argument indicate we're only
+#'  interested in parsing the augmented version of `x`.
+#'  * each of the other 3 elements augment `x` in some way
+#'    * `augmented_x_ym` - builds the `ym` and `my` formats by adding `"01"`
+#'    (to indicate the first day of the month)
+#'    * `augmented_x_yq` - transforms the `yq` format to `ymd`, by deriving the
+#'    first month of the quarter and adding `"01"` to indicate the first day
+#'    * `augmented_x_qy` - transforms the `qy` format to `ymd` in a similar
+#'    manner to `"yq"`
+#'
+#' @param x an Expression corresponding to a character or numeric vector of
+#' dates to be parsed.
+#' @param orders a character vector of date-time formats.
+#'
+#' @return a list made up of 4 lists, each a different version of x:
+#'  * `processed_x`
+#'  * `augmented_x_ym`
+#'  * `augmented_x_yq`
+#'  * `augmented_x_qy`
+#' @noRd
+process_data_for_parsing <- function(x,
+                                     orders) {
+
+  processed_x <- x$cast(string())
+
+  # make all separators (non-letters and non-numbers) into "-"
+  processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+  # collapse multiple separators into a single one
+  processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+  # we need to transform `x` when orders are `ym`, `my`, and `yq`
+  # for `ym` and `my` orders we add a day ("01")
+  # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627
+  augmented_x_ym <- NULL
+  if (any(orders %in% c("ym", "my", "Ym", "mY"))) {
+    # add day as "-01" if there is a "-" separator and as "01" if not
+    augmented_x_ym <- call_binding(
+      "if_else",
+      call_binding("grepl", "-", processed_x),
+      call_binding("paste0", processed_x, "-01"),
+      call_binding("paste0", processed_x, "01")
+    )
+  }
+
+  # for `yq` we need to transform the quarter into the start month (lubridate
+  # behaviour) and then add 01 to parse to the first day of the quarter
+  augmented_x_yq <- NULL
+  if (any(orders %in% c("yq", "Yq"))) {
+    # extract everything that comes after the `-` separator, i.e. the quarter
+    # (e.g. 4 from 2022-4)
+    quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+    # we should probably error if quarter is not in 1:4
+    # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+    # in 2002-4)
+    year_x <- call_binding("gsub", "-.*$", "", processed_x)
+    quarter_x <- quarter_x$cast(int32())
+    month_x <- (quarter_x - 1) * 3 + 1
+    augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+  }
+
+  # same as for `yq`, we need to derive the month from the quarter and add a
+  # "01" to give us the first day of the month
+  augmented_x_qy <- NULL
+  if (any(orders %in% c("qy", "qY"))) {
+    quarter_x <- call_binding("gsub", "-.*$", "", processed_x)
+    quarter_x <- quarter_x$cast(int32())
+    year_x <- call_binding("gsub", "^.*?-", "", processed_x)
+    # year might be missing the final 0s when extracted from a float, hence the
+    # need to pad
+    year_x <- call_binding("str_pad", year_x, width = 4, side = "right", pad = 
"0")
+    month_x <- (quarter_x - 1) * 3 + 1
+    augmented_x_qy <- call_binding("paste0", year_x, "-", month_x, "-01")
+  }
+
+  list(
+    "augmented_x_ym" = augmented_x_ym,
+    "augmented_x_yq" = augmented_x_yq,
+    "augmented_x_qy" = augmented_x_qy,
+    "processed_x" = processed_x
+  )
+}
+
+
+#' Attempt parsing
+#'
+#' This function does several things:
+#'   * builds all possible `formats` from the supplied `orders`
+#'   * processes the data with `process_data_for_parsing()`
+#'   * build a list of the possible `strptime` Expressions for the data & 
formats
+#'   combinations
+#'
+#' @inheritParams process_data_for_parsing
+#'
+#' @return a list of `strptime` Expressions we can use with `coalesce`
+#' @noRd
+attempt_parsing <- function(x,
+                            orders) {

Review Comment:
   Done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to