jonkeane commented on code in PR #13196:
URL: https://github.com/apache/arrow/pull/13196#discussion_r879766587
##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -179,7 +179,24 @@ build_formats <- function(orders) {
orders <- unique(c(orders1, orders2))
}
- supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+ ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+ ymd_hms_orders <- c(
+ "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS",
+ "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H"
+ )
+ # support "%I" hour formats
+ ymd_ims_orders <- gsub("H", "I", ymd_hms_orders)
+
+ supported_orders <- c(
+ ymd_orders,
+ ymd_hms_orders,
+ gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as separators
Review Comment:
Do we need an additional set of formats for `" "` and `"_"`? I thought that
we were converging the formats down to one seaprator (and doing the same to the
input columns|strings) such that we only have one delimiter that we need to
deal with. Am I missing something here?
##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +218,100 @@ build_formats <- function(orders) {
}
build_format_from_order <- function(order) {
- year_chars <- c("%y", "%Y")
- month_chars <- c("%m", "%B", "%b")
- day_chars <- "%d"
-
- outcome <- switch(
- order,
- "ymd" = expand.grid(year_chars, month_chars, day_chars),
- "ydm" = expand.grid(year_chars, day_chars, month_chars),
- "mdy" = expand.grid(month_chars, day_chars, year_chars),
- "myd" = expand.grid(month_chars, year_chars, day_chars),
- "dmy" = expand.grid(day_chars, month_chars, year_chars),
- "dym" = expand.grid(day_chars, year_chars, month_chars)
+ char_list <- list(
+ "y" = c("%y", "%Y"),
+ "m" = c("%m", "%B", "%b"),
+ "d" = "%d",
+ "H" = "%H",
+ "M" = "%M",
+ "S" = "%S",
+ "I" = "%I"
+ )
+
+ split_order <- strsplit(order, split = "")[[1]]
+
+ outcome <- expand.grid(char_list[split_order])
+ formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+ formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+ c(formats_with_sep, formats_without_sep)
+}
+
+process_data_for_parsing <- function(x,
+ orders) {
+
+ processed_x <- x$cast(string())
+
+ # make all separators (non-letters and non-numbers) into "-"
+ processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+ # collapse multiple separators into a single one
+ processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+ # we need to transform `x` when orders are `ym`, `my`, and `yq`
+ # for `ym` and `my` orders we add a day ("01")
+ # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627
+ augmented_x_ym <- NULL
+ if (any(orders %in% c("ym", "my"))) {
+ # add day as "-01" if there is a "-" separator and as "01" if not
+ augmented_x_ym <- call_binding(
+ "if_else",
+ call_binding("grepl", "-", processed_x),
+ call_binding("paste0", processed_x, "-01"),
+ call_binding("paste0", processed_x, "01")
+ )
+ }
+
+ # for `yq` we need to transform the quarter into the start month (lubridate
+ # behaviour) and then add 01 to parse to the first day of the quarter
+ augmented_x_yq <- NULL
+ if (any(orders == "yq")) {
+ # extract everything that comes after the `-` separator, i.e. the quarter
+ # (e.g. 4 from 2022-4)
+ quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+ # we should probably error if quarter is not in 1:4
+ # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+ # in 2002-4)
+ year_x <- call_binding("gsub", "-.*$", "", processed_x)
+ quarter_x <- quarter_x$cast(int32())
+ month_x <- (quarter_x - 1) * 3 + 1
+ augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+ }
+
+ list(
+ "augmented_x_ym" = augmented_x_ym,
+ "augmented_x_yq" = augmented_x_yq,
+ "processed_x" = processed_x
+ )
+}
+
+attempt_parsing <- function(x,
+ orders) {
+ # translate orders into possible formats
+ formats <- build_formats(orders)
+
+ processed_data <- process_data_for_parsing(x, orders)
+
+ parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs,
formats)
Review Comment:
Could you add some more comments describing what are in each of these
variables?
##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +218,100 @@ build_formats <- function(orders) {
}
build_format_from_order <- function(order) {
- year_chars <- c("%y", "%Y")
- month_chars <- c("%m", "%B", "%b")
- day_chars <- "%d"
-
- outcome <- switch(
- order,
- "ymd" = expand.grid(year_chars, month_chars, day_chars),
- "ydm" = expand.grid(year_chars, day_chars, month_chars),
- "mdy" = expand.grid(month_chars, day_chars, year_chars),
- "myd" = expand.grid(month_chars, year_chars, day_chars),
- "dmy" = expand.grid(day_chars, month_chars, year_chars),
- "dym" = expand.grid(day_chars, year_chars, month_chars)
+ char_list <- list(
+ "y" = c("%y", "%Y"),
+ "m" = c("%m", "%B", "%b"),
+ "d" = "%d",
+ "H" = "%H",
+ "M" = "%M",
+ "S" = "%S",
+ "I" = "%I"
+ )
+
+ split_order <- strsplit(order, split = "")[[1]]
+
+ outcome <- expand.grid(char_list[split_order])
+ formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+ formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+ c(formats_with_sep, formats_without_sep)
+}
+
+process_data_for_parsing <- function(x,
+ orders) {
+
+ processed_x <- x$cast(string())
+
+ # make all separators (non-letters and non-numbers) into "-"
+ processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+ # collapse multiple separators into a single one
+ processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+ # we need to transform `x` when orders are `ym`, `my`, and `yq`
+ # for `ym` and `my` orders we add a day ("01")
+ # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627
+ augmented_x_ym <- NULL
+ if (any(orders %in% c("ym", "my"))) {
+ # add day as "-01" if there is a "-" separator and as "01" if not
+ augmented_x_ym <- call_binding(
+ "if_else",
+ call_binding("grepl", "-", processed_x),
+ call_binding("paste0", processed_x, "-01"),
+ call_binding("paste0", processed_x, "01")
+ )
+ }
+
+ # for `yq` we need to transform the quarter into the start month (lubridate
+ # behaviour) and then add 01 to parse to the first day of the quarter
+ augmented_x_yq <- NULL
+ if (any(orders == "yq")) {
+ # extract everything that comes after the `-` separator, i.e. the quarter
+ # (e.g. 4 from 2022-4)
+ quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+ # we should probably error if quarter is not in 1:4
+ # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+ # in 2002-4)
+ year_x <- call_binding("gsub", "-.*$", "", processed_x)
+ quarter_x <- quarter_x$cast(int32())
+ month_x <- (quarter_x - 1) * 3 + 1
+ augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+ }
Review Comment:
Do we have tests for the format `qy`?
##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -488,105 +488,28 @@ register_bindings_duration_helpers <- function() {
register_bindings_datetime_parsers <- function() {
register_binding("parse_date_time", function(x,
orders,
- tz = "UTC") {
-
- # each order is translated into possible formats
- formats <- build_formats(orders)
-
- x <- x$cast(string())
-
- # make all separators (non-letters and non-numbers) into "-"
- x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x)
- # collapse multiple separators into a single one
- x <- call_binding("gsub", "-{2,}", "-", x)
-
- # we need to transform `x` when orders are `ym`, `my`, and `yq`
- # for `ym` and `my` orders we add a day ("01")
- augmented_x <- NULL
- if (any(orders %in% c("ym", "my"))) {
- augmented_x <- call_binding("paste0", x, "-01")
- }
-
- # for `yq` we need to transform the quarter into the start month (lubridate
- # behaviour) and then add 01 to parse to the first day of the quarter
- augmented_x2 <- NULL
- if (any(orders == "yq")) {
- # extract everything that comes after the `-` separator, i.e. the quarter
- # (e.g. 4 from 2022-4)
- quarter_x <- call_binding("gsub", "^.*?-", "", x)
- # we should probably error if quarter is not in 1:4
- # extract everything that comes before the `-`, i.e. the year (e.g. 2002
- # in 2002-4)
- year_x <- call_binding("gsub", "-.*$", "", x)
- quarter_x <- quarter_x$cast(int32())
- month_x <- (quarter_x - 1) * 3 + 1
- augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01")
- }
-
- # TODO figure out how to parse strings that have no separators
- # https://issues.apache.org/jira/browse/ARROW-16446
- # we could insert separators at the "likely" positions, but it might be
- # tricky given the possible combinations between dmy formats + locale
-
- # build a list of expressions for each format
- parse_attempt_expressions <- map(
- formats,
- ~ build_expr(
- "strptime",
- x,
- options = list(
- format = .x,
- unit = 0L,
- error_is_null = TRUE
- )
- )
- )
-
- # build separate expression lists of parsing attempts for the orders that
- # need an augmented `x`
- # list for attempts when orders %in% c("ym", "my")
- parse_attempt_exp_augmented_x <- list()
-
- if (!is.null(augmented_x)) {
- parse_attempt_exp_augmented_x <- map(
- formats,
- ~ build_expr(
- "strptime",
- augmented_x,
- options = list(
- format = .x,
- unit = 0L,
- error_is_null = TRUE
- )
- )
- )
+ tz = "UTC",
+ truncated = 0,
+ quiet = TRUE,
+ exact = FALSE) {
+ if (!quiet) {
+ arrow_not_supported("`quiet = FALSE`")
}
- # list for attempts when orders %in% c("yq")
- parse_attempt_exp_augmented_x2 <- list()
- if (!is.null(augmented_x2)) {
- parse_attempt_exp_augmented_x2 <- map(
- formats,
- ~ build_expr(
- "strptime",
- augmented_x2,
- options = list(
- format = .x,
- unit = 0L,
- error_is_null = TRUE
- )
- )
- )
+ if (truncated != 0) {
+ # build several orders for truncated formats
+ orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop =
nchar(orders) - .x))
Review Comment:
Does this match the lubridate behavior for `truncate`? If not, is it
possible to match that behavior? If you haven't already, you might want to look
at the source of lubridate and see how it's done there
##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -179,7 +179,24 @@ build_formats <- function(orders) {
orders <- unique(c(orders1, orders2))
}
- supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+ ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
+ ymd_hms_orders <- c(
+ "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS",
+ "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H"
+ )
+ # support "%I" hour formats
+ ymd_ims_orders <- gsub("H", "I", ymd_hms_orders)
Review Comment:
Is this ok to do? I thought `H` was 24-hour hours and `I` was 12? Do we have
a test for using H with 24-hour style hours?
##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +218,100 @@ build_formats <- function(orders) {
}
build_format_from_order <- function(order) {
- year_chars <- c("%y", "%Y")
- month_chars <- c("%m", "%B", "%b")
- day_chars <- "%d"
-
- outcome <- switch(
- order,
- "ymd" = expand.grid(year_chars, month_chars, day_chars),
- "ydm" = expand.grid(year_chars, day_chars, month_chars),
- "mdy" = expand.grid(month_chars, day_chars, year_chars),
- "myd" = expand.grid(month_chars, year_chars, day_chars),
- "dmy" = expand.grid(day_chars, month_chars, year_chars),
- "dym" = expand.grid(day_chars, year_chars, month_chars)
+ char_list <- list(
+ "y" = c("%y", "%Y"),
+ "m" = c("%m", "%B", "%b"),
+ "d" = "%d",
+ "H" = "%H",
+ "M" = "%M",
+ "S" = "%S",
+ "I" = "%I"
+ )
+
+ split_order <- strsplit(order, split = "")[[1]]
+
+ outcome <- expand.grid(char_list[split_order])
+ formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+ formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+ c(formats_with_sep, formats_without_sep)
+}
+
+process_data_for_parsing <- function(x,
+ orders) {
+
+ processed_x <- x$cast(string())
+
+ # make all separators (non-letters and non-numbers) into "-"
+ processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+ # collapse multiple separators into a single one
+ processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+ # we need to transform `x` when orders are `ym`, `my`, and `yq`
+ # for `ym` and `my` orders we add a day ("01")
+ # TODO revisit after https://issues.apache.org/jira/browse/ARROW-16627
+ augmented_x_ym <- NULL
+ if (any(orders %in% c("ym", "my"))) {
+ # add day as "-01" if there is a "-" separator and as "01" if not
+ augmented_x_ym <- call_binding(
+ "if_else",
+ call_binding("grepl", "-", processed_x),
+ call_binding("paste0", processed_x, "-01"),
+ call_binding("paste0", processed_x, "01")
+ )
+ }
+
+ # for `yq` we need to transform the quarter into the start month (lubridate
+ # behaviour) and then add 01 to parse to the first day of the quarter
+ augmented_x_yq <- NULL
+ if (any(orders == "yq")) {
+ # extract everything that comes after the `-` separator, i.e. the quarter
+ # (e.g. 4 from 2022-4)
+ quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+ # we should probably error if quarter is not in 1:4
+ # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+ # in 2002-4)
+ year_x <- call_binding("gsub", "-.*$", "", processed_x)
+ quarter_x <- quarter_x$cast(int32())
+ month_x <- (quarter_x - 1) * 3 + 1
+ augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+ }
+
+ list(
+ "augmented_x_ym" = augmented_x_ym,
+ "augmented_x_yq" = augmented_x_yq,
+ "processed_x" = processed_x
+ )
+}
+
+attempt_parsing <- function(x,
+ orders) {
+ # translate orders into possible formats
+ formats <- build_formats(orders)
+
+ processed_data <- process_data_for_parsing(x, orders)
+
+ parse_attempt_exprs_list <- map(processed_data, build_strptime_exprs,
formats)
+
+ # if all orders are in c("ym", "my", "yq") only attempt to parse the
augmented_x
+ if (all(orders %in% c("ym", "my", "yq"))) {
+ parse_attempt_exprs_list$processed_x <- list()
+ }
+
+ purrr::flatten(parse_attempt_exprs_list)
Review Comment:
What's the structure here that needs flattening? I'm not saying it's wrong —
I'm just not following from the code above why this is necesary.
##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +213,130 @@ build_formats <- function(orders) {
}
build_format_from_order <- function(order) {
- year_chars <- c("%y", "%Y")
- month_chars <- c("%m", "%B", "%b")
- day_chars <- "%d"
-
- outcome <- switch(
- order,
- "ymd" = expand.grid(year_chars, month_chars, day_chars),
- "ydm" = expand.grid(year_chars, day_chars, month_chars),
- "mdy" = expand.grid(month_chars, day_chars, year_chars),
- "myd" = expand.grid(month_chars, year_chars, day_chars),
- "dmy" = expand.grid(day_chars, month_chars, year_chars),
- "dym" = expand.grid(day_chars, year_chars, month_chars)
+ char_list <- list(
+ "y" = c("%y", "%Y"),
+ "m" = c("%m", "%B", "%b"),
+ "d" = "%d",
+ "H" = "%H",
+ "M" = "%M",
+ "S" = "%S"
+ )
+
+ split_order <- strsplit(order, split = "")[[1]]
+
+ outcome <- expand.grid(char_list[split_order])
+ formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+ formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+ c(formats_with_sep, formats_without_sep)
Review Comment:
Right, you would need to add that `if_else()` to the expressions. This will
require this code be in a slightly different location to take be added in the
right spot (though you should be able to construct a test to see what kinds of
performance impacts there are for running 1x orders versus 2x in this case).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]