This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 73cfd2d0d0 ARROW-17885: [R] Return BLOB data as list of raw instead of
a list of integers (#14277)
73cfd2d0d0 is described below
commit 73cfd2d0d0e1e5a2192fb73e5262c77953664f81
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Oct 10 17:08:34 2022 -0300
ARROW-17885: [R] Return BLOB data as list of raw instead of a list of
integers (#14277)
This PR adds support for `blob::blob()`, which is common in R database land
to denote "binary", and `vctrs::list_of()`, which is similar, easy, and helps a
bit with list of things that happen to be all NULL.
We have our own infrastructure for binary and lists of things too, which I
assume pre-dates the mature vctrs and blob? Should we consider having
`as.vector()` output those objects instead of the custom
`arrow_list/large_list/binary` classes we implement here?
Lead-authored-by: Dewey Dunnington <[email protected]>
Co-authored-by: Dewey Dunnington <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/DESCRIPTION | 1 +
r/NAMESPACE | 4 +++
r/R/array.R | 20 +++++++++++++
r/R/type.R | 14 +++++++++
r/src/r_to_arrow.cpp | 2 +-
r/src/type_infer.cpp | 29 +++++++++++-------
r/tests/testthat/_snaps/Array.md | 8 +++++
r/tests/testthat/test-Array.R | 64 +++++++++++++++++++++++++++++++++++++++-
r/tests/testthat/test-type.R | 32 ++++++++++++++++++++
9 files changed, 161 insertions(+), 13 deletions(-)
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index cf83f56390..4b526e8b8a 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -45,6 +45,7 @@ RoxygenNote: 7.2.1
Config/testthat/edition: 3
VignetteBuilder: knitr
Suggests:
+ blob,
cli,
DBI,
dbplyr,
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 8b08b940b3..24a9e14bb6 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -41,9 +41,11 @@ S3method(as.vector,ArrowDatum)
S3method(as_arrow_array,Array)
S3method(as_arrow_array,ChunkedArray)
S3method(as_arrow_array,Scalar)
+S3method(as_arrow_array,blob)
S3method(as_arrow_array,data.frame)
S3method(as_arrow_array,default)
S3method(as_arrow_array,pyarrow.lib.Array)
+S3method(as_arrow_array,vctrs_list_of)
S3method(as_arrow_table,RecordBatch)
S3method(as_arrow_table,RecordBatchReader)
S3method(as_arrow_table,Table)
@@ -100,7 +102,9 @@ S3method(head,Scanner)
S3method(head,arrow_dplyr_query)
S3method(infer_type,ArrowDatum)
S3method(infer_type,Expression)
+S3method(infer_type,blob)
S3method(infer_type,default)
+S3method(infer_type,vctrs_list_of)
S3method(is.finite,ArrowDatum)
S3method(is.infinite,ArrowDatum)
S3method(is.na,ArrowDatum)
diff --git a/r/R/array.R b/r/R/array.R
index 938c8e4b04..7c2fb5c783 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -322,6 +322,26 @@ as_arrow_array.data.frame <- function(x, ..., type = NULL)
{
}
}
+#' @export
+as_arrow_array.vctrs_list_of <- function(x, ..., type = NULL) {
+ type <- type %||% infer_type(x)
+ if (!inherits(type, "ListType") && !inherits(type, "LargeListType")) {
+ stop_cant_convert_array(x, type)
+ }
+
+ as_arrow_array(unclass(x), type = type)
+}
+
+#' @export
+as_arrow_array.blob <- function(x, ..., type = NULL) {
+ type <- type %||% infer_type(x)
+ if (!type$Equals(binary()) && !type$Equals(large_binary())) {
+ stop_cant_convert_array(x, type)
+ }
+
+ as_arrow_array(unclass(x), type = type)
+}
+
stop_cant_convert_array <- function(x, type) {
if (is.null(type)) {
abort(
diff --git a/r/R/type.R b/r/R/type.R
index d4d7d52ad5..5089789f6c 100644
--- a/r/R/type.R
+++ b/r/R/type.R
@@ -111,6 +111,20 @@ infer_type.default <- function(x, ...,
from_array_infer_type = FALSE) {
}
}
+#' @export
+infer_type.vctrs_list_of <- function(x, ...) {
+ list_of(infer_type(attr(x, "ptype")))
+}
+
+#' @export
+infer_type.blob <- function(x, ...) {
+ if (sum(lengths(x)) > .Machine$integer.max) {
+ large_binary()
+ } else {
+ binary()
+ }
+}
+
#' @export
infer_type.ArrowDatum <- function(x, ...) x$type
diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp
index aa51799585..c472d8286f 100644
--- a/r/src/r_to_arrow.cpp
+++ b/r/src/r_to_arrow.cpp
@@ -743,7 +743,7 @@ Status check_binary(SEXP x, int64_t size) {
// check this is a list of raw vectors
const SEXP* p_x = VECTOR_PTR_RO(x);
for (R_xlen_t i = 0; i < size; i++, ++p_x) {
- if (TYPEOF(*p_x) != RAWSXP) {
+ if (TYPEOF(*p_x) != RAWSXP && (*p_x != R_NilValue)) {
return Status::Invalid("invalid R type to convert to binary");
}
}
diff --git a/r/src/type_infer.cpp b/r/src/type_infer.cpp
index e30d0e1288..e668918ac7 100644
--- a/r/src/type_infer.cpp
+++ b/r/src/type_infer.cpp
@@ -159,22 +159,29 @@ std::shared_ptr<arrow::DataType>
InferArrowTypeFromVector<VECSXP>(SEXP x) {
return arrow::large_binary();
}
+ // Check attr(x, "ptype") for an appropriate R prototype
SEXP ptype = Rf_getAttrib(x, symbols::ptype);
- if (Rf_isNull(ptype)) {
- if (XLENGTH(x) == 0) {
- cpp11::stop(
- "Requires at least one element to infer the values' type of a list
vector");
+ if (!Rf_isNull(ptype)) {
+ arrow::list(InferArrowType(ptype));
+ }
+
+ // If unspecified, iterate through the vector until we get a non-null
result
+ // special case raw() vectors, since we want list(raw()) to result in
+ // a binary() array
+ for (R_xlen_t i = 0; i < XLENGTH(x); i++) {
+ ptype = VECTOR_ELT(x, i);
+ if (Rf_isNull(ptype)) {
+ continue;
}
- // Iterate through the vector until we get a non-null result
- for (R_xlen_t i = 0; i < XLENGTH(x); i++) {
- ptype = VECTOR_ELT(x, i);
- if (!Rf_isNull(ptype)) {
- break;
- }
+
+ if (!Rf_isObject(ptype) && TYPEOF(ptype) == RAWSXP) {
+ return arrow::binary();
+ } else {
+ return arrow::list(InferArrowType(ptype));
}
}
- return arrow::list(InferArrowType(ptype));
+ return arrow::list(arrow::null());
}
}
diff --git a/r/tests/testthat/_snaps/Array.md b/r/tests/testthat/_snaps/Array.md
index f6ec523510..fbcee7a15c 100644
--- a/r/tests/testthat/_snaps/Array.md
+++ b/r/tests/testthat/_snaps/Array.md
@@ -18,6 +18,14 @@
Can't create Array<float64()> from object of type class_not_supported
+# as_arrow_array() works for blob::blob()
+
+ Can't create Array<int32()> from object of type blob / vctrs_list_of /
vctrs_vctr / list
+
+# as_arrow_array() works for vctrs::list_of()
+
+ Can't create Array<int32()> from object of type vctrs_list_of / vctrs_vctr
/ list
+
# Array doesn't support c()
Use `concat_arrays()` or `ChunkedArray$create()` instead.
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
index a2299326e4..37b56bbc43 100644
--- a/r/tests/testthat/test-Array.R
+++ b/r/tests/testthat/test-Array.R
@@ -849,7 +849,6 @@ test_that("Array$create() should have helpful error", {
int <- integer(0)
num <- numeric(0)
char <- character(0)
- expect_error(Array$create(list()), "Requires at least one element to infer")
expect_error(Array$create(list(lgl, lgl, int)), "Expecting a logical vector")
expect_error(Array$create(list(char, num, char)), "Expecting a character
vector")
@@ -1172,6 +1171,69 @@ test_that("as_arrow_array() default method errors", {
)
})
+test_that("as_arrow_array() works for blob::blob()", {
+ skip_if_not_installed("blob")
+
+ # empty
+ expect_r6_class(as_arrow_array(blob::blob()), "Array")
+ expect_equal(
+ as_arrow_array(blob::blob()),
+ as_arrow_array(list(), type = binary())
+ )
+
+ # all null
+ expect_equal(
+ as_arrow_array(blob::blob(NULL, NULL)),
+ as_arrow_array(list(NULL, NULL), type = binary())
+ )
+
+ expect_equal(
+ as_arrow_array(blob::blob(as.raw(1:5), NULL)),
+ as_arrow_array(list(as.raw(1:5), NULL), type = binary())
+ )
+
+ expect_equal(
+ as_arrow_array(blob::blob(as.raw(1:5)), type = large_binary()),
+ as_arrow_array(list(as.raw(1:5)), type = large_binary())
+ )
+
+ expect_snapshot_error(
+ as_arrow_array(blob::blob(as.raw(1:5)), type = int32())
+ )
+})
+
+test_that("as_arrow_array() works for vctrs::list_of()", {
+ # empty
+ expect_r6_class(as_arrow_array(vctrs::list_of(.ptype = integer())), "Array")
+ expect_equal(
+ as_arrow_array(vctrs::list_of(.ptype = integer())),
+ as_arrow_array(list(), type = list_of(int32()))
+ )
+
+ # all NULL
+ expect_equal(
+ as_arrow_array(vctrs::list_of(NULL, NULL, .ptype = integer())),
+ as_arrow_array(list(NULL, NULL), type = list_of(int32()))
+ )
+
+ expect_equal(
+ as_arrow_array(vctrs::list_of(1:5, NULL, .ptype = integer())),
+ as_arrow_array(list(1:5, NULL), type = list_of(int32()))
+ )
+
+ expect_equal(
+ as_arrow_array(
+ vctrs::list_of(1:5, .ptype = integer()),
+ type = large_list_of(int32())
+ ),
+ as_arrow_array(list(1:5), type = large_list_of(int32()))
+ )
+
+ expect_snapshot_error(
+ as_arrow_array(vctrs::list_of(1:5, .ptype = integer()), type = int32())
+ )
+})
+
test_that("concat_arrays works", {
concat_empty <- concat_arrays()
expect_true(concat_empty$type == null())
diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R
index 14f0ea7a8d..0fbeec0a49 100644
--- a/r/tests/testthat/test-type.R
+++ b/r/tests/testthat/test-type.R
@@ -85,6 +85,20 @@ test_that("infer_type() can infer nested extension types", {
)
})
+test_that("infer_type() can infer vctrs::list_of() types", {
+ expect_equal(infer_type(vctrs::list_of(.ptype = integer())),
list_of(int32()))
+})
+
+test_that("infer_type() can infer blob type", {
+ skip_if_not_installed("blob")
+
+ expect_equal(infer_type(blob::blob()), binary())
+
+ big_ish_raw <- raw(2 ^ 20)
+ big_ish_blob <- blob::new_blob(rep(list(big_ish_raw), 2049))
+ expect_equal(infer_type(big_ish_blob), large_binary())
+})
+
test_that("DataType$Equals", {
a <- int32()
b <- int32()
@@ -294,6 +308,18 @@ test_that("type() is deprecated", {
expect_equal(a_type, a$type)
})
+test_that("infer_type() infers type for lists of raw() as binary()", {
+ expect_equal(
+ infer_type(list(raw())),
+ binary()
+ )
+
+ expect_equal(
+ infer_type(list(NULL, raw(), raw())),
+ binary()
+ )
+})
+
test_that("infer_type() infers type for lists starting with NULL -
ARROW-17639", {
null_start_list <- list(NULL, c(2, 3), c(4, 5))
@@ -308,4 +334,10 @@ test_that("infer_type() infers type for lists starting
with NULL - ARROW-17639",
infer_type(totally_null_list),
list_of(null())
)
+
+ empty_list <- list()
+ expect_equal(
+ infer_type(empty_list),
+ list_of(null())
+ )
})