[GitHub] [arrow] nealrichardson commented on a change in pull request #8947: ARROW-9187: [R] Add bindings for arithmetic kernels

GitBox Wed, 30 Dec 2020 11:08:42 -0800


nealrichardson commented on a change in pull request #8947:
URL: https://github.com/apache/arrow/pull/8947#discussion_r550293049




##########
File path: r/R/expression.R
##########
@@ -59,6 +59,44 @@ build_array_expression <- function(.Generic, e1, e2, ...) {
   } else {
     e1 <- .wrap_arrow(e1, .Generic, e2$type)
     e2 <- .wrap_arrow(e2, .Generic, e1$type)
+
+    # In Arrow, "divide" is one function, which does integer division on
+    # integer inputs and floating-point division on floats
+    if (.Generic == "/") {
+      # TODO: omg so many ways it's wrong to assume these types
+      e1 <- e1$cast(float64())

Review comment:
       Don't `$cast()` here because that evaluates. This should still be 
`array_expression(...)`, though you could write a `cast_array_expression()` 
helper that is like `Expression$cast()` if you want.

##########
File path: r/R/expression.R
##########
@@ -153,104 +200,82 @@ print.array_expression <- function(x, ...) {
 #' `Expression$field_ref(name)` is used to construct an `Expression` which
 #' evaluates to the named column in the `Dataset` against which it is 
evaluated.
 #'
-#' `Expression$compare(OP, e1, e2)` takes two `Expression` operands, 
constructing
-#' an `Expression` which will evaluate these operands then compare them with 
the
-#' relation specified by OP (e.g. "==", "!=", ">", etc.) For example, to filter
-#' down to rows where the column named "alpha" is less than 5:
-#' `Expression$compare("<", Expression$field_ref("alpha"), 
Expression$scalar(5))`
-#'
-#' `Expression$and(e1, e2)`, `Expression$or(e1, e2)`, and `Expression$not(e1)`
-#' construct an `Expression` combining their arguments with Boolean operators.
-#'
-#' `Expression$is_valid(x)` is essentially (an inversion of) `is.na()` for 
`Expression`s.
-#'
-#' `Expression$in_(x, set)` evaluates x and returns whether or not it is a 
member of the set.
+#' `Expression$create(function_name, ..., options)` builds a function-call
+#' `Expression` containing one or more `Expression`s.
 #' @name Expression
 #' @rdname Expression
 #' @export
 Expression <- R6Class("Expression", inherit = ArrowObject,
   public = list(
-    ToString = function() dataset___expr__ToString(self)
+    ToString = function() dataset___expr__ToString(self),
+    cast = function(to_type, ...) {
+      Expression$create("cast", self, options = list(to_type = to_type, ...))
+    }
   )
 )
-
+Expression$create <- function(function_name,
+                              ...,
+                              args = list(...),
+                              options = empty_named_list()) {
+  assert_that(is.string(function_name))
+  dataset___expr__call(function_name, args, options)
+}
 Expression$field_ref <- function(name) {
-  assert_is(name, "character")
-  assert_that(length(name) == 1)
+  assert_that(is.string(name))
   dataset___expr__field_ref(name)
 }
 Expression$scalar <- function(x) {
   dataset___expr__scalar(Scalar$create(x))
 }
-Expression$compare <- function(OP, e1, e2) {
-  comp_func <- comparison_function_map[[OP]]
-  if (is.null(comp_func)) {
-    stop(OP, " is not a supported comparison function", call. = FALSE)
-  }
-  comp_func(e1, e2)
-}
 
-comparison_function_map <- list(
-  "==" = dataset___expr__equal,
-  "!=" = dataset___expr__not_equal,
-  ">" = dataset___expr__greater,
-  ">=" = dataset___expr__greater_equal,
-  "<" = dataset___expr__less,
-  "<=" = dataset___expr__less_equal
-)
-Expression$in_ <- function(x, set) {
-  dataset___expr__in(x, Array$create(set))
-}
-Expression$and <- function(e1, e2) {
-  dataset___expr__and(e1, e2)
-}
-Expression$or <- function(e1, e2) {
-  dataset___expr__or(e1, e2)
-}
-Expression$not <- function(e1) {
-  dataset___expr__not(e1)
-}
-Expression$is_valid <- function(e1) {
-  dataset___expr__is_valid(e1)
+build_dataset_expression <- function(.Generic, e1, e2, ...) {
+  if (.Generic %in% names(.unary_function_map)) {
+    expr <- Expression$create(.unary_function_map[[.Generic]], e1)
+  } else if (.Generic == "%in%") {
+    # Special-case %in%, which is different from the Array function name
+    expr <- Expression$create("is_in", e1,
+      options = list(
+        value_set = Array$create(e2),
+        skip_nulls = TRUE
+      )
+    )
+  } else {
+    if (!inherits(e1, "Expression")) {
+      e1 <- Expression$scalar(e1)
+    }
+    if (!inherits(e2, "Expression")) {
+      e2 <- Expression$scalar(e2)
+    }
+
+    # In Arrow, "divide" is one function, which does integer division on
+    # integer inputs and floating-point division on floats
+    if (.Generic == "/") {
+      # TODO: omg so many ways it's wrong to assume these types
+      e1 <- e1$cast(float64())
+      e2 <- e2$cast(float64())
+    } else if (.Generic == "%/%") {
+      # In R, integer division works like floor(float division)
+      out <- build_dataset_expression("/", e1, e2)
+      return(out$cast(int32(), allow_float_truncate = TRUE))
+    } else if (.Generic == "%%") {
+      # TODO: need to do something with types to ensure that e2 is compatible

Review comment:
       Is this TODO valid?

##########
File path: r/R/expression.R
##########
@@ -59,6 +59,44 @@ build_array_expression <- function(.Generic, e1, e2, ...) {
   } else {
     e1 <- .wrap_arrow(e1, .Generic, e2$type)
     e2 <- .wrap_arrow(e2, .Generic, e1$type)
+
+    # In Arrow, "divide" is one function, which does integer division on
+    # integer inputs and floating-point division on floats
+    if (.Generic == "/") {
+      # TODO: omg so many ways it's wrong to assume these types
+      e1 <- e1$cast(float64())
+      e2 <- e2$cast(float64())
+    } else if (.Generic == "%/%") {
+      return(array_expression("cast", 
array_expression(.binary_function_map[[.Generic]], e1, e2, ...), options = 
list(to_type = int32(), allow_float_truncate = TRUE)))
+    } else if (.Generic == "%%") {
+      # {e1 - e2 * ( e1 %/% e2 )}
+      # TODO: there has to be a way to use the form ^^^ instead of this.
+      # with return(e1 - e2 * (e1 %/% e2)) we get:
+      # "cannot add bindings to a locked environment"
+      out <- array_expression(
+        "subtract_checked", e1, array_expression(
+          "multiply_checked", e2, array_expression(
+            # this outer cast is to ensure that the result of this and the
+            # result of multiply are the same
+            "cast",
+            array_expression(
+              "cast",
+              array_expression(.binary_function_map[[.Generic]], e1, e2, ...),
+              options = list(to_type = int32(), allow_float_truncate = TRUE)
+            ),
+            options = list(to_type = e2$type, allow_float_truncate = TRUE)
+          )
+        )
+      )
+      return(out)
+    }
+
+    # hack to use subtract instead of subtract_checked for timestamps

Review comment:
       Why only subtract?
   
   And technically this could also be `else if` from above

##########
File path: r/R/expression.R
##########
@@ -59,6 +59,44 @@ build_array_expression <- function(.Generic, e1, e2, ...) {
   } else {
     e1 <- .wrap_arrow(e1, .Generic, e2$type)
     e2 <- .wrap_arrow(e2, .Generic, e1$type)
+
+    # In Arrow, "divide" is one function, which does integer division on
+    # integer inputs and floating-point division on floats
+    if (.Generic == "/") {
+      # TODO: omg so many ways it's wrong to assume these types
+      e1 <- e1$cast(float64())
+      e2 <- e2$cast(float64())
+    } else if (.Generic == "%/%") {
+      return(array_expression("cast", 
array_expression(.binary_function_map[[.Generic]], e1, e2, ...), options = 
list(to_type = int32(), allow_float_truncate = TRUE)))
+    } else if (.Generic == "%%") {
+      # {e1 - e2 * ( e1 %/% e2 )}
+      # TODO: there has to be a way to use the form ^^^ instead of this.
+      # with return(e1 - e2 * (e1 %/% e2)) we get:
+      # "cannot add bindings to a locked environment"

Review comment:
       I'll pull and see if I can figure out why this is problematic, though 
maybe it's solved by removing `$cast()` above

##########
File path: r/tests/testthat/test-dplyr.R
##########
@@ -133,6 +139,76 @@ test_that("filtering with expression", {
   )
 })
 
+test_that("filtering with arithmetic", {
+  expect_dplyr_equal(
+    input %>%
+      filter(dbl + 1 > 3) %>%
+      select(string = chr, int, dbl) %>%
+      collect(),
+    tbl
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      filter(dbl / 2 > 3) %>%
+      select(string = chr, int, dbl) %>%
+      collect(),
+    tbl
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      filter(dbl / 2L > 3) %>%
+      select(string = chr, int, dbl) %>%
+      collect(),
+    tbl
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      filter(int / 2 > 3) %>%
+      select(string = chr, int, dbl) %>%
+      collect(),
+    tbl
+  )
+
+  expect_dplyr_equal(
+    input %>%
+      filter(int / 2L > 3) %>%
+      select(string = chr, int, dbl) %>%
+      collect(),
+    tbl
+  )
+
+  skip("autocasting should happen in compute kernels; R workaround fails on 
this ARROW-11078")

Review comment:
       Does this fail for the same reason that the comparisons below fail? `int 
/ 2L` is an integer but the `array_expression` doesn't know the resulting type, 
so it can't cast `3` to it?
   
   How does it fail? Gracefully or not?
   
   We could work around this either by tracking/guessing the resulting type and 
sticking it in the `array_expression` object or by deferring the autocasting 
until the expressions are evaluated (so we'll know the type of `int / 2L` when 
we evaluate `that > 3`), but it's probably not worth it now. Maybe make a JIRA 
for us to come back to?

##########
File path: r/tests/testthat/test-dataset.R
##########
@@ -494,12 +494,104 @@ test_that("filter() on date32 columns", {
   )
 })
 
+test_that("filter() with expressions", {
+  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+  expect_is(ds$format, "ParquetFileFormat")
+  expect_is(ds$filesystem, "LocalFileSystem")
+  expect_is(ds, "Dataset")
+  expect_equivalent(
+    ds %>%
+      select(chr, dbl) %>%
+      filter(dbl * 2 > 14 & dbl - 50 < 3L) %>%
+      collect() %>%
+      arrange(dbl),
+    rbind(
+      df1[8:10, c("chr", "dbl")],
+      df2[1:2, c("chr", "dbl")]
+    )
+  )
+
+  # check division's special casing.
+  expect_equivalent(
+    ds %>%
+      select(chr, dbl) %>%
+      filter(dbl / 2 > 3.5 & dbl < 53) %>%
+      collect() %>%
+      arrange(dbl),
+    rbind(
+      df1[8:10, c("chr", "dbl")],
+      df2[1:2, c("chr", "dbl")]
+    )
+  )
+
+  expect_equivalent(
+    ds %>%
+      select(chr, dbl, int) %>%
+      filter(int %/% 2L > 3 & dbl < 53) %>%
+      collect() %>%
+      arrange(dbl),
+    rbind(
+      df1[8:10, c("chr", "dbl", "int")],
+      df2[1:2, c("chr", "dbl", "int")]
+    )
+  )
+
+  expect_equivalent(
+    ds %>%
+      select(chr, dbl, int) %>%
+      filter(int %/% 2 > 3 & dbl < 53) %>%
+      collect() %>%
+      arrange(dbl),
+    rbind(
+      df1[8:10, c("chr", "dbl", "int")],
+      df2[1:2, c("chr", "dbl", "int")]
+    )
+  )
+
+  expect_equivalent(
+    ds %>%
+      select(chr, dbl, int) %>%
+      filter(int %% 2L > 0 & dbl < 53) %>%
+      collect() %>%
+      arrange(dbl),
+    rbind(
+      df1[c(1, 3, 5, 7, 9), c("chr", "dbl", "int")],
+      df2[1, c("chr", "dbl", "int")]
+    )
+  )
+
+  skip("autocasting should happen in compute kernels; R workaround fails on 
this ARROW-11078")

Review comment:
       But datasets do have autocasting, so this should work, and if it 
doesn't, that sounds like a different JIRA.

##########
File path: r/tests/testthat/test-compute-arith.R
##########
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("Addition", {
+  a <- Array$create(c(1:4, NA_integer_))
+  expect_type_equal(a, int32())
+  expect_type_equal(a + 4, int32())
+  expect_equal(a + 4, Array$create(c(5:8, NA_integer_)))
+  expect_identical(as.vector(a + 4), c(5:8, NA_integer_))
+  expect_equal(a + 4L, Array$create(c(5:8, NA_integer_)))
+  expect_vector(a + 4L, c(5:8, NA_integer_))
+  expect_equal(a + NA_integer_, Array$create(rep(NA_integer_, 5)))
+
+  # overflow errors — this is slightly different from R's `NA` coercion when
+  # overflowing, but better than the alternative of silently restarting
+  casted <- a$cast(int8())
+  expect_error(casted + 257)
+
+  skip("autocasting should happen in compute kernels; R workaround fails on 
this ARROW-11078")
+  expect_type_equal(a + 4.1, float64())
+  expect_equal(a + 4.1, Array$create(c(5.1, 6.1, 7.1, 8.1, NA_real_)))
+})
+
+test_that("Subtraction", {
+  a <- Array$create(c(1:4, NA_integer_))
+  expect_equal(a - 3, Array$create(c(-2:1, NA_integer_)))
+})
+
+test_that("Multiplication", {
+  a <- Array$create(c(1:4, NA_integer_))
+  expect_equal(a * 2, Array$create(c(1:4 * 2L, NA_integer_)))
+})
+
+test_that("Division", {
+  a <- Array$create(c(1:4, NA_integer_))
+  expect_equal(a / 2, Array$create(c(1:4 / 2, NA_real_)))
+  expect_equal(a %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+  expect_equal(a / 2 / 2, Array$create(c(1:4 / 2 / 2, NA_real_)))
+  expect_equal(a %/% 2 %/% 2, Array$create(c(0L, 0L, 0L, 1L, NA_integer_)))
+
+  b <- a$cast(float64())
+  expect_equal(b / 2, Array$create(c(1:4 / 2, NA_real_)))
+  expect_equal(b %/% 2, Array$create(c(0L, 1L, 1L, 2L, NA_integer_)))
+
+  # the behavior of %/% matches R's (i.e. the integer of the quotient, not
+  # simply dividing two integers)
+  expect_equal(b / 2.2, Array$create(c(1:4 / 2.2, NA_real_)))
+  # c(1:4) %/% 2.2 != c(1:4) %/% as.integer(2.2)
+  # c(1:4) %/% 2.2             == c(0L, 0L, 1L, 1L)
+  # c(1:4) %/% as.integer(2.2) == c(0L, 1L, 1L, 2L)
+  expect_equal(b %/% 2.2, Array$create(c(0L, 0L, 1L, 1L, NA_integer_)))
+
+  expect_equal(a %% 2, Array$create(c(1L, 0L, 1L, 0L, NA_integer_)))
+
+  expect_equal(b %% 2, Array$create(c(1:4 %% 2, NA_real_)))
+})
+
+test_that("Dates casting", {
+  a <- Array$create(c(Sys.Date() + 1:4, NA_integer_))
+
+  skip("autocasting should happen in compute kernels; R workaround fails on 
this ARROW-11078")
+  expect_equal(a + 2, Array$create(c((Sys.Date() + 1:4 ) + 2), NA_integer_))
+})
+
+test_that("Datetimes", {
+  a <- Array$create(c(Sys.time() + 1:4, NA_integer_))
+  b <- Scalar$create(Sys.time())
+  result <- a - b
+  expect_is(result$type, "DataType")
+  expect_identical(result$type$ToString(), "duration[us]")

Review comment:
       (1) I don't think this is right. timestamp + integer = timestamp, but 
because the integer is cast to timestamp, you get a duration. 
   (1b) Maybe you could support that by casting the integer to a duration (same 
units as the timestamp), but this is a bunch of stuff that should probably get 
handled in C++.
   (2) According to 
https://arrow.apache.org/docs/r/articles/arrow.html#arrow-to-r, we don't 
support converting duration types to R. 
   
   Given this, I think arithmetic with dates/times/timestamps should be punted 
to its own JIRA.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] nealrichardson commented on a change in pull request #8947: ARROW-9187: [R] Add bindings for arithmetic kernels

Reply via email to