This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 73cfd2d0d0 ARROW-17885: [R] Return BLOB data as list of raw instead of 
a list of integers (#14277)
73cfd2d0d0 is described below

commit 73cfd2d0d0e1e5a2192fb73e5262c77953664f81
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Oct 10 17:08:34 2022 -0300

    ARROW-17885: [R] Return BLOB data as list of raw instead of a list of 
integers (#14277)
    
    This PR adds support for `blob::blob()`, which is common in R database land 
to denote "binary", and `vctrs::list_of()`, which is similar, easy, and helps a 
bit with list of things that happen to be all NULL.
    
    We have our own infrastructure for binary and lists of things too, which I 
assume pre-dates the mature vctrs and blob? Should we consider having 
`as.vector()` output those objects instead of the custom 
`arrow_list/large_list/binary` classes we implement here?
    
    Lead-authored-by: Dewey Dunnington <[email protected]>
    Co-authored-by: Dewey Dunnington <[email protected]>
    Signed-off-by: Neal Richardson <[email protected]>
---
 r/DESCRIPTION                    |  1 +
 r/NAMESPACE                      |  4 +++
 r/R/array.R                      | 20 +++++++++++++
 r/R/type.R                       | 14 +++++++++
 r/src/r_to_arrow.cpp             |  2 +-
 r/src/type_infer.cpp             | 29 +++++++++++-------
 r/tests/testthat/_snaps/Array.md |  8 +++++
 r/tests/testthat/test-Array.R    | 64 +++++++++++++++++++++++++++++++++++++++-
 r/tests/testthat/test-type.R     | 32 ++++++++++++++++++++
 9 files changed, 161 insertions(+), 13 deletions(-)

diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index cf83f56390..4b526e8b8a 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -45,6 +45,7 @@ RoxygenNote: 7.2.1
 Config/testthat/edition: 3
 VignetteBuilder: knitr
 Suggests:
+    blob,
     cli,
     DBI,
     dbplyr,
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 8b08b940b3..24a9e14bb6 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -41,9 +41,11 @@ S3method(as.vector,ArrowDatum)
 S3method(as_arrow_array,Array)
 S3method(as_arrow_array,ChunkedArray)
 S3method(as_arrow_array,Scalar)
+S3method(as_arrow_array,blob)
 S3method(as_arrow_array,data.frame)
 S3method(as_arrow_array,default)
 S3method(as_arrow_array,pyarrow.lib.Array)
+S3method(as_arrow_array,vctrs_list_of)
 S3method(as_arrow_table,RecordBatch)
 S3method(as_arrow_table,RecordBatchReader)
 S3method(as_arrow_table,Table)
@@ -100,7 +102,9 @@ S3method(head,Scanner)
 S3method(head,arrow_dplyr_query)
 S3method(infer_type,ArrowDatum)
 S3method(infer_type,Expression)
+S3method(infer_type,blob)
 S3method(infer_type,default)
+S3method(infer_type,vctrs_list_of)
 S3method(is.finite,ArrowDatum)
 S3method(is.infinite,ArrowDatum)
 S3method(is.na,ArrowDatum)
diff --git a/r/R/array.R b/r/R/array.R
index 938c8e4b04..7c2fb5c783 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -322,6 +322,26 @@ as_arrow_array.data.frame <- function(x, ..., type = NULL) 
{
   }
 }
 
+#' @export
+as_arrow_array.vctrs_list_of <- function(x, ..., type = NULL) {
+  type <- type %||% infer_type(x)
+  if (!inherits(type, "ListType") && !inherits(type, "LargeListType")) {
+    stop_cant_convert_array(x, type)
+  }
+
+  as_arrow_array(unclass(x), type = type)
+}
+
+#' @export
+as_arrow_array.blob <- function(x, ..., type = NULL) {
+  type <- type %||% infer_type(x)
+  if (!type$Equals(binary()) && !type$Equals(large_binary())) {
+    stop_cant_convert_array(x, type)
+  }
+
+  as_arrow_array(unclass(x), type = type)
+}
+
 stop_cant_convert_array <- function(x, type) {
   if (is.null(type)) {
     abort(
diff --git a/r/R/type.R b/r/R/type.R
index d4d7d52ad5..5089789f6c 100644
--- a/r/R/type.R
+++ b/r/R/type.R
@@ -111,6 +111,20 @@ infer_type.default <- function(x, ..., 
from_array_infer_type = FALSE) {
   }
 }
 
+#' @export
+infer_type.vctrs_list_of <- function(x, ...) {
+  list_of(infer_type(attr(x, "ptype")))
+}
+
+#' @export
+infer_type.blob <- function(x, ...) {
+  if (sum(lengths(x)) > .Machine$integer.max) {
+    large_binary()
+  } else {
+    binary()
+  }
+}
+
 #' @export
 infer_type.ArrowDatum <- function(x, ...) x$type
 
diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp
index aa51799585..c472d8286f 100644
--- a/r/src/r_to_arrow.cpp
+++ b/r/src/r_to_arrow.cpp
@@ -743,7 +743,7 @@ Status check_binary(SEXP x, int64_t size) {
       // check this is a list of raw vectors
       const SEXP* p_x = VECTOR_PTR_RO(x);
       for (R_xlen_t i = 0; i < size; i++, ++p_x) {
-        if (TYPEOF(*p_x) != RAWSXP) {
+        if (TYPEOF(*p_x) != RAWSXP && (*p_x != R_NilValue)) {
           return Status::Invalid("invalid R type to convert to binary");
         }
       }
diff --git a/r/src/type_infer.cpp b/r/src/type_infer.cpp
index e30d0e1288..e668918ac7 100644
--- a/r/src/type_infer.cpp
+++ b/r/src/type_infer.cpp
@@ -159,22 +159,29 @@ std::shared_ptr<arrow::DataType> 
InferArrowTypeFromVector<VECSXP>(SEXP x) {
       return arrow::large_binary();
     }
 
+    // Check attr(x, "ptype") for an appropriate R prototype
     SEXP ptype = Rf_getAttrib(x, symbols::ptype);
-    if (Rf_isNull(ptype)) {
-      if (XLENGTH(x) == 0) {
-        cpp11::stop(
-            "Requires at least one element to infer the values' type of a list 
vector");
+    if (!Rf_isNull(ptype)) {
+      arrow::list(InferArrowType(ptype));
+    }
+
+    // If unspecified, iterate through the vector until we get a non-null 
result
+    // special case raw() vectors, since we want list(raw()) to result in
+    // a binary() array
+    for (R_xlen_t i = 0; i < XLENGTH(x); i++) {
+      ptype = VECTOR_ELT(x, i);
+      if (Rf_isNull(ptype)) {
+        continue;
       }
-      // Iterate through the vector until we get a non-null result
-      for (R_xlen_t i = 0; i < XLENGTH(x); i++) {
-        ptype = VECTOR_ELT(x, i);
-        if (!Rf_isNull(ptype)) {
-          break;
-        }
+
+      if (!Rf_isObject(ptype) && TYPEOF(ptype) == RAWSXP) {
+        return arrow::binary();
+      } else {
+        return arrow::list(InferArrowType(ptype));
       }
     }
 
-    return arrow::list(InferArrowType(ptype));
+    return arrow::list(arrow::null());
   }
 }
 
diff --git a/r/tests/testthat/_snaps/Array.md b/r/tests/testthat/_snaps/Array.md
index f6ec523510..fbcee7a15c 100644
--- a/r/tests/testthat/_snaps/Array.md
+++ b/r/tests/testthat/_snaps/Array.md
@@ -18,6 +18,14 @@
 
     Can't create Array<float64()> from object of type class_not_supported
 
+# as_arrow_array() works for blob::blob()
+
+    Can't create Array<int32()> from object of type blob / vctrs_list_of / 
vctrs_vctr / list
+
+# as_arrow_array() works for vctrs::list_of()
+
+    Can't create Array<int32()> from object of type vctrs_list_of / vctrs_vctr 
/ list
+
 # Array doesn't support c()
 
     Use `concat_arrays()` or `ChunkedArray$create()` instead.
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
index a2299326e4..37b56bbc43 100644
--- a/r/tests/testthat/test-Array.R
+++ b/r/tests/testthat/test-Array.R
@@ -849,7 +849,6 @@ test_that("Array$create() should have helpful error", {
   int <- integer(0)
   num <- numeric(0)
   char <- character(0)
-  expect_error(Array$create(list()), "Requires at least one element to infer")
   expect_error(Array$create(list(lgl, lgl, int)), "Expecting a logical vector")
   expect_error(Array$create(list(char, num, char)), "Expecting a character 
vector")
 
@@ -1172,6 +1171,69 @@ test_that("as_arrow_array() default method errors", {
   )
 })
 
+test_that("as_arrow_array() works for blob::blob()", {
+  skip_if_not_installed("blob")
+
+  # empty
+  expect_r6_class(as_arrow_array(blob::blob()), "Array")
+  expect_equal(
+    as_arrow_array(blob::blob()),
+    as_arrow_array(list(), type = binary())
+  )
+
+  # all null
+  expect_equal(
+    as_arrow_array(blob::blob(NULL, NULL)),
+    as_arrow_array(list(NULL, NULL), type = binary())
+  )
+
+  expect_equal(
+    as_arrow_array(blob::blob(as.raw(1:5), NULL)),
+    as_arrow_array(list(as.raw(1:5), NULL), type = binary())
+  )
+
+  expect_equal(
+    as_arrow_array(blob::blob(as.raw(1:5)), type = large_binary()),
+    as_arrow_array(list(as.raw(1:5)), type = large_binary())
+  )
+
+  expect_snapshot_error(
+    as_arrow_array(blob::blob(as.raw(1:5)), type = int32())
+  )
+})
+
+test_that("as_arrow_array() works for vctrs::list_of()", {
+  # empty
+  expect_r6_class(as_arrow_array(vctrs::list_of(.ptype = integer())), "Array")
+  expect_equal(
+    as_arrow_array(vctrs::list_of(.ptype = integer())),
+    as_arrow_array(list(), type = list_of(int32()))
+  )
+
+  # all NULL
+  expect_equal(
+    as_arrow_array(vctrs::list_of(NULL, NULL, .ptype = integer())),
+    as_arrow_array(list(NULL, NULL), type = list_of(int32()))
+  )
+
+  expect_equal(
+    as_arrow_array(vctrs::list_of(1:5, NULL, .ptype = integer())),
+    as_arrow_array(list(1:5, NULL), type = list_of(int32()))
+  )
+
+  expect_equal(
+    as_arrow_array(
+      vctrs::list_of(1:5, .ptype = integer()),
+      type = large_list_of(int32())
+    ),
+    as_arrow_array(list(1:5), type = large_list_of(int32()))
+  )
+
+  expect_snapshot_error(
+    as_arrow_array(vctrs::list_of(1:5, .ptype = integer()), type = int32())
+  )
+})
+
 test_that("concat_arrays works", {
   concat_empty <- concat_arrays()
   expect_true(concat_empty$type == null())
diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R
index 14f0ea7a8d..0fbeec0a49 100644
--- a/r/tests/testthat/test-type.R
+++ b/r/tests/testthat/test-type.R
@@ -85,6 +85,20 @@ test_that("infer_type() can infer nested extension types", {
   )
 })
 
+test_that("infer_type() can infer vctrs::list_of() types", {
+  expect_equal(infer_type(vctrs::list_of(.ptype = integer())), 
list_of(int32()))
+})
+
+test_that("infer_type() can infer blob type", {
+  skip_if_not_installed("blob")
+
+  expect_equal(infer_type(blob::blob()), binary())
+
+  big_ish_raw <- raw(2 ^ 20)
+  big_ish_blob <- blob::new_blob(rep(list(big_ish_raw), 2049))
+  expect_equal(infer_type(big_ish_blob), large_binary())
+})
+
 test_that("DataType$Equals", {
   a <- int32()
   b <- int32()
@@ -294,6 +308,18 @@ test_that("type() is deprecated", {
   expect_equal(a_type, a$type)
 })
 
+test_that("infer_type() infers type for lists of raw() as binary()", {
+  expect_equal(
+    infer_type(list(raw())),
+    binary()
+  )
+
+  expect_equal(
+    infer_type(list(NULL, raw(), raw())),
+    binary()
+  )
+})
+
 test_that("infer_type() infers type for lists starting with NULL - 
ARROW-17639", {
   null_start_list <- list(NULL, c(2, 3), c(4, 5))
 
@@ -308,4 +334,10 @@ test_that("infer_type() infers type for lists starting 
with NULL - ARROW-17639",
     infer_type(totally_null_list),
     list_of(null())
   )
+
+  empty_list <- list()
+  expect_equal(
+    infer_type(empty_list),
+    list_of(null())
+  )
 })

Reply via email to