jonkeane commented on a change in pull request #12433:
URL: https://github.com/apache/arrow/pull/12433#discussion_r818756056



##########
File path: r/R/dplyr-funcs-type.R
##########
@@ -76,6 +76,48 @@ register_bindings_type_cast <- function() {
   register_binding("as.numeric", function(x) {
     Expression$create("cast", x, options = cast_options(to_type = float64()))
   })
+  register_binding("as.Date", function(x,
+                                       format = NULL,
+                                       tryFormats = "%Y-%m-%d",
+                                       origin = "1970-01-01",
+                                       tz = "UTC") {
+
+    # the origin argument will be better supported once we implement temporal
+    # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
+    # TODO revisit once the above has been sorted
+    if (call_binding("is.numeric", x) & origin != "1970-01-01") {
+      abort("`as.Date()` with an `origin` different than '1970-01-01' is not 
supported in Arrow")
+    }

Review comment:
       This is a great approach, thanks for this!

##########
File path: r/tests/testthat/test-dplyr-funcs-type.R
##########
@@ -767,4 +768,81 @@ test_that("nested structs can be created from scalars and 
existing data frames",
       collect(),
     tibble(a = 1:2)
   )
+
+  })
+
+test_that("as.Date() converts successfully from date, timestamp, integer, char 
and double", {
+  test_df <- tibble::tibble(
+    posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"),
+    date_var = as.Date("2022-02-25"),
+    character_ymd_var = "2022-02-25 00:00:01",
+    character_ydm_var = "2022/25/02 00:00:01",
+    integer_var = 32L,
+    double_var = 34.56
+  )
+
+  # casting from POSIXct treated separately so we can skip on Windows
+  # TODO move the test for casting from POSIXct below once ARROW-13168 is done
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        date_dv = as.Date(date_var),
+        date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d 
%H:%M:%S"),
+        date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m 
%H:%M:%S"),
+        date_int = as.Date(integer_var, origin = "1970-01-01")
+      ) %>%
+      collect(),
+    test_df
+  )
+
+  # currently we do not support an origin different to "1970-01-01"
+  expect_warning(
+    test_df %>%
+      arrow_table() %>%
+      mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>%
+      collect(),
+    regexp = "`as.Date()` with an `origin` different than '1970-01-01' is not 
supported in Arrow",
+    fixed = TRUE
+  )

Review comment:
       Could this (and the few other `expect_warnings()` below) use the form:
   
   ```
     compare_dplyr_binding(
        .input %>%
           arrow_table() %>%
           mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>%
           collect(),
         test_df,
         warning = TRUE
       )
   ```
   
   Which we use elsewhere to show that this is a "not support in Arrow, pulling 
into R". If you wanted to put the specific error in a comment above, that would 
also be fine to show _why_ it's being pulled into R for folks who are reading 
the tests.

##########
File path: r/R/dplyr-funcs-type.R
##########
@@ -76,6 +76,48 @@ register_bindings_type_cast <- function() {
   register_binding("as.numeric", function(x) {
     Expression$create("cast", x, options = cast_options(to_type = float64()))
   })
+  register_binding("as.Date", function(x,
+                                       format = NULL,
+                                       tryFormats = "%Y-%m-%d",
+                                       origin = "1970-01-01",
+                                       tz = "UTC") {
+
+    # the origin argument will be better supported once we implement temporal
+    # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
+    # TODO revisit once the above has been sorted
+    if (call_binding("is.numeric", x) & origin != "1970-01-01") {
+      abort("`as.Date()` with an `origin` different than '1970-01-01' is not 
supported in Arrow")
+    }
+
+    # this could be improved with tryFormats once strptime returns NA and we
+    # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659
+    # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is 
done
+    if (is.null(format) && length(tryFormats) > 1) {
+      abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
+    }
+
+    if (call_binding("is.Date", x)) {
+      return(x)
+
+    # cast from POSIXct
+    } else if (call_binding("is.POSIXct", x)) {
+      # base::as.Date() first converts to the desired timezone and then 
extracts
+      # the date, which is why we need to go through timestamp() first
+      x <- build_expr("cast", x, options = cast_options(to_type = 
timestamp(timezone = tz)))

Review comment:
       Very nice comment

##########
File path: r/R/dplyr-funcs-type.R
##########
@@ -76,6 +76,48 @@ register_bindings_type_cast <- function() {
   register_binding("as.numeric", function(x) {
     Expression$create("cast", x, options = cast_options(to_type = float64()))
   })
+  register_binding("as.Date", function(x,
+                                       format = NULL,
+                                       tryFormats = "%Y-%m-%d",
+                                       origin = "1970-01-01",
+                                       tz = "UTC") {
+
+    # the origin argument will be better supported once we implement temporal
+    # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
+    # TODO revisit once the above has been sorted
+    if (call_binding("is.numeric", x) & origin != "1970-01-01") {
+      abort("`as.Date()` with an `origin` different than '1970-01-01' is not 
supported in Arrow")
+    }
+
+    # this could be improved with tryFormats once strptime returns NA and we
+    # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659
+    # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is 
done
+    if (is.null(format) && length(tryFormats) > 1) {
+      abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
+    }
+
+    if (call_binding("is.Date", x)) {
+      return(x)
+
+    # cast from POSIXct
+    } else if (call_binding("is.POSIXct", x)) {
+      # base::as.Date() first converts to the desired timezone and then 
extracts
+      # the date, which is why we need to go through timestamp() first
+      x <- build_expr("cast", x, options = cast_options(to_type = 
timestamp(timezone = tz)))
+
+    # cast from character
+    } else if (call_binding("is.character", x)) {
+      format <- format %||% tryFormats[[1]]
+      x <- build_expr("strptime", x, options = list(format = format, unit = 
0L))

Review comment:
       It might be nice to have a comment above here that explains what `unit = 
0L` means in human terms just so that someone doesn't have to go looking for 
it. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to