dragosmg commented on code in PR #13196: URL: https://github.com/apache/arrow/pull/13196#discussion_r880569374
########## r/R/dplyr-datetime-helpers.R: ########## @@ -201,19 +218,100 @@ build_formats <- function(orders) { } build_format_from_order <- function(order) { - year_chars <- c("%y", "%Y") - month_chars <- c("%m", "%B", "%b") - day_chars <- "%d" - - outcome <- switch( - order, - "ymd" = expand.grid(year_chars, month_chars, day_chars), - "ydm" = expand.grid(year_chars, day_chars, month_chars), - "mdy" = expand.grid(month_chars, day_chars, year_chars), - "myd" = expand.grid(month_chars, year_chars, day_chars), - "dmy" = expand.grid(day_chars, month_chars, year_chars), - "dym" = expand.grid(day_chars, year_chars, month_chars) + char_list <- list( + "y" = c("%y", "%Y"), + "m" = c("%m", "%B", "%b"), + "d" = "%d", + "H" = "%H", + "M" = "%M", + "S" = "%S", + "I" = "%I" + ) + + split_order <- strsplit(order, split = "")[[1]] + + outcome <- expand.grid(char_list[split_order]) + formats_with_sep <- do.call(paste, c(outcome, sep = "-")) + formats_without_sep <- do.call(paste, c(outcome, sep = "")) + c(formats_with_sep, formats_without_sep) +} + +process_data_for_parsing <- function(x, + orders) { + + processed_x <- x$cast(string()) + + # make all separators (non-letters and non-numbers) into "-" + processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x) + # collapse multiple separators into a single one + processed_x <- call_binding("gsub", "-{2,}", "-", processed_x) + + # we need to transform `x` when orders are `ym`, `my`, and `yq` + # for `ym` and `my` orders we add a day ("01") + # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627 + augmented_x_ym <- NULL + if (any(orders %in% c("ym", "my"))) { + # add day as "-01" if there is a "-" separator and as "01" if not + augmented_x_ym <- call_binding( + "if_else", + call_binding("grepl", "-", processed_x), + call_binding("paste0", processed_x, "-01"), + call_binding("paste0", processed_x, "01") + ) + } + + # for `yq` we need to transform the quarter into the start month (lubridate + # behaviour) 
and then add 01 to parse to the first day of the quarter + augmented_x_yq <- NULL + if (any(orders == "yq")) { + # extract everything that comes after the `-` separator, i.e. the quarter + # (e.g. 4 from 2022-4) + quarter_x <- call_binding("gsub", "^.*?-", "", processed_x) + # we should probably error if quarter is not in 1:4 + # extract everything that comes before the `-`, i.e. the year (e.g. 2002 + # in 2002-4) + year_x <- call_binding("gsub", "-.*$", "", processed_x) + quarter_x <- quarter_x$cast(int32()) + month_x <- (quarter_x - 1) * 3 + 1 + augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01") + } + + list( + "augmented_x_ym" = augmented_x_ym, + "augmented_x_yq" = augmented_x_yq, + "processed_x" = processed_x + ) +} + +attempt_parsing <- function(x, + orders) { + # translate orders into possible formats + formats <- build_formats(orders) + + processed_data <- process_data_for_parsing(x, orders) + + parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs, formats) + + # if all orders are in c("ym", "my", "yq") only attempt to parse the augmented_x + if (all(orders %in% c("ym", "my", "yq"))) { + parse_attempt_exprs_list$processed_x <- list() + } + + purrr::flatten(parse_attempt_exprs_list) Review Comment: It's a list of lists of expressions (one list of `Expressions` for each variant of `x`). We need to remove one level in order for the resulting `list` to play nicely with `build_expr("coalesce", args = list)`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org