jonkeane commented on code in PR #12738:
URL: https://github.com/apache/arrow/pull/12738#discussion_r855473384
##########
r/tests/testthat/test-dplyr-funcs-type.R:
##########
@@ -845,6 +854,15 @@ test_that("as.Date() converts successfully from date,
timestamp, integer, char a
warning = TRUE
)
+ # strptime does not support a partial format -
+ # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813
+ expect_error(
+ test_df %>%
+ arrow_table() %>%
+ mutate(date_char_ymd = as_date(character_ymd_var)) %>%
+ collect()
+ )
Review Comment:
Is the error here coming from C++ only (and is that why you're not asserting
what the error is)? If yes to both: can we add something to the comment. We
should be clear (to our future selves) why we are going against our pattern
that one should always assert what the error is when using `expect_error`.
##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -463,3 +463,56 @@ duration_from_chunks <- function(chunks) {
}
duration
}
+
+binding_as_date <- function(x,
+ format = NULL,
+ tryFormats = "%Y-%m-%d",
+ origin = "1970-01-01") {
+
+ if (is.null(format) && length(tryFormats) > 1) {
+ abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
+ }
+
+ if (call_binding("is.Date", x)) {
+ return(x)
+
+ # cast from character
+ } else if (call_binding("is.character", x)) {
+ x <- binding_as_date_character(x, format, tryFormats)
+
+ # cast from numeric
+ } else if (call_binding("is.numeric", x)) {
+ x <- binding_as_date_numeric(x, origin)
+ }
+
+ build_expr("cast", x, options = cast_options(to_type = date32()))
+}
+
+binding_as_date_character <- function(x,
+ format = NULL,
+ tryFormats = "%Y-%m-%d") {
+ format <- format %||% tryFormats[[1]]
+ # unit = 0L is the identifier for seconds in valid_time32_units
+ build_expr("strptime", x, options = list(format = format, unit = 0L))
+}
+
+binding_as_date_numeric <- function(x,
+ origin = "1970-01-01") {
Review Comment:
```suggestion
binding_as_date_numeric <- function(x, origin = "1970-01-01") {
```
Minor style nitpick, this could all to on one line, it's quite short still.
##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -463,3 +463,56 @@ duration_from_chunks <- function(chunks) {
}
duration
}
+
+binding_as_date <- function(x,
+ format = NULL,
+ tryFormats = "%Y-%m-%d",
+ origin = "1970-01-01") {
+
+ if (is.null(format) && length(tryFormats) > 1) {
+ abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
+ }
+
+ if (call_binding("is.Date", x)) {
+ return(x)
+
+ # cast from character
+ } else if (call_binding("is.character", x)) {
+ x <- binding_as_date_character(x, format, tryFormats)
+
+ # cast from numeric
+ } else if (call_binding("is.numeric", x)) {
+ x <- binding_as_date_numeric(x, origin)
+ }
+
+ build_expr("cast", x, options = cast_options(to_type = date32()))
Review Comment:
This + the helper functions below are fantastic! Makes it so much easier to
isolate "which of this is about the type?" and "which of this is about the
vagaries of `as.Date`?".
##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -463,3 +463,56 @@ duration_from_chunks <- function(chunks) {
}
duration
}
+
+binding_as_date <- function(x,
+ format = NULL,
+ tryFormats = "%Y-%m-%d",
+ origin = "1970-01-01") {
+
+ if (is.null(format) && length(tryFormats) > 1) {
+ abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
+ }
+
+ if (call_binding("is.Date", x)) {
+ return(x)
+
+ # cast from character
+ } else if (call_binding("is.character", x)) {
+ x <- binding_as_date_character(x, format, tryFormats)
+
+ # cast from numeric
+ } else if (call_binding("is.numeric", x)) {
+ x <- binding_as_date_numeric(x, origin)
+ }
+
+ build_expr("cast", x, options = cast_options(to_type = date32()))
+}
+
+binding_as_date_character <- function(x,
+ format = NULL,
+ tryFormats = "%Y-%m-%d") {
+ format <- format %||% tryFormats[[1]]
+ # unit = 0L is the identifier for seconds in valid_time32_units
+ build_expr("strptime", x, options = list(format = format, unit = 0L))
+}
+
+binding_as_date_numeric <- function(x,
+ origin = "1970-01-01") {
+
+ # Arrow does not support direct casting from double to date32(), but for
+ # integer-like values we can go via int32()
+ # https://issues.apache.org/jira/browse/ARROW-15798
+ # TODO revisit if arrow decides to support double -> date casting
+ if (!call_binding("is.integer", x)) {
+ x <- build_expr("cast", x, options = cast_options(to_type = int32()))
+ }
+
+ if (origin != "1970-01-01") {
+ delta_in_sec <- call_binding("difftime", origin, "1970-01-01")
+ delta_in_sec <- build_expr("cast", delta_in_sec, options =
cast_options(to_type = int64()))
Review Comment:
Would you mind adding the issue that I presume you created for int32 ->
duration here?
##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -463,3 +463,56 @@ duration_from_chunks <- function(chunks) {
}
duration
}
+
+binding_as_date <- function(x,
+ format = NULL,
+ tryFormats = "%Y-%m-%d",
+ origin = "1970-01-01") {
Review Comment:
I don't think we should change this here now, but something to think about
and maybe do in the future if we have similar setups like this: It would
probably be easier to pass the args down using `...` that way we don't always
have to write out all of the arguments if we were to add one.
Like I said, let's not change this now, but wanted to suggest that.
##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -463,3 +463,56 @@ duration_from_chunks <- function(chunks) {
}
duration
}
+
+binding_as_date <- function(x,
+ format = NULL,
+ tryFormats = "%Y-%m-%d",
+ origin = "1970-01-01") {
+
+ if (is.null(format) && length(tryFormats) > 1) {
+ abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
+ }
+
+ if (call_binding("is.Date", x)) {
+ return(x)
+
+ # cast from character
+ } else if (call_binding("is.character", x)) {
+ x <- binding_as_date_character(x, format, tryFormats)
+
+ # cast from numeric
+ } else if (call_binding("is.numeric", x)) {
+ x <- binding_as_date_numeric(x, origin)
+ }
+
+ build_expr("cast", x, options = cast_options(to_type = date32()))
+}
+
+binding_as_date_character <- function(x,
+ format = NULL,
+ tryFormats = "%Y-%m-%d") {
+ format <- format %||% tryFormats[[1]]
+ # unit = 0L is the identifier for seconds in valid_time32_units
+ build_expr("strptime", x, options = list(format = format, unit = 0L))
+}
+
+binding_as_date_numeric <- function(x,
+ origin = "1970-01-01") {
+
+ # Arrow does not support direct casting from double to date32(), but for
+ # integer-like values we can go via int32()
+ # https://issues.apache.org/jira/browse/ARROW-15798
+ # TODO revisit if arrow decides to support double -> date casting
+ if (!call_binding("is.integer", x)) {
+ x <- build_expr("cast", x, options = cast_options(to_type = int32()))
+ }
+
+ if (origin != "1970-01-01") {
+ delta_in_sec <- call_binding("difftime", origin, "1970-01-01")
+ delta_in_sec <- build_expr("cast", delta_in_sec, options =
cast_options(to_type = int64()))
Review Comment:
And also(?) or only(?) the issue you created for making the wrapper we
talked about on https://github.com/apache/arrow/pull/12855
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]