This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 4ace1a2 feat(r): Implement dictionary conversion (#285)
4ace1a2 is described below
commit 4ace1a2522f1f9f06011e88992d429eb678412fa
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Aug 23 14:24:19 2023 -0300
feat(r): Implement dictionary conversion (#285)
This PR implements conversion from dictionary-encoded arrays to R
vectors. To make this happen, it was also necessary to refactor a few
things about the conversion process to make writing conversions in R
easier. This is also with an eye towards extension type conversion (up
next) where it is expected that extension types will mostly implement
conversions as S3 `convert_array()` method. Previously this didn't work
if converting a stream with multiple batches...now it does (although
will incur S3 dispatch overhead for each batch for conversions that are
not handled in C).
Conversion from a dictionary always defaults to the conversion that
would happen from the dictionary value type. This is because in Arrow
the dictionary is a property of the array, not the type (whereas in R, a
factor's levels are a property of the type, sort of). For a
dictionary-encoded array that arrives in chunks, the only type-stable
way to perform the conversion is by resolving the dictionary into the
value type. This also makes it a more predictable default for other
dictionary-encoded types (none of which have an R analogue).
In practice this has low memory overhead because the most commonly
dictionary-encoded type is a string, and because R has a global string
pool; however, it is quite a bit slower than decoding directly to
`factor()`.
Like other conversions (and like `vctrs::vec_cast()`), you can also
request a target `ptype`. This is implemented for `factor()`. If
converting an array stream containing dictionary-encoded values, the
`levels` need to be explicitly specified.
``` r
library(nanoarrow)
dict_array <- as_nanoarrow_array(sample(0:9, 10, replace = TRUE))
dict_array$dictionary <- as_nanoarrow_array(stringr::words[1:10])
# default conversion is always the diciontary (value) type
infer_nanoarrow_ptype(dict_array)
#> character(0)
convert_array(dict_array)
#> [1] "absolute" "a" "across" "achieve" "accept" "a"
#> [7] "active" "a" "a" "across"
# this makes it type stable when consuming streams
dict1 <- as_nanoarrow_array(factor(letters[1:5]))
dict2 <- as_nanoarrow_array(factor(letters[6:10]))
stream <- basic_array_stream(list(dict1, dict2))
convert_array_stream(stream)
#> [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
# ...and more consistent when dealing with other value types
dict_array$dictionary <-
as_nanoarrow_array(arrow::as_arrow_array(lapply(1:10, function(i) 1:5 * i)))
infer_nanoarrow_ptype(dict_array)
#> <list_of<integer>[0]>
# If you want a factor, you can specify it explicitly
dict_array$dictionary <- as_nanoarrow_array(stringr::words[1:10])
convert_array(dict_array, vctrs::partial_factor())
#> [1] "absolute" "a" "across" "achieve" "accept" "a"
#> [7] "active" "a" "a" "across"
convert_array(dict_array, factor(levels = stringr::words[10:1]))
#> [1] absolute a across achieve accept a active a
#> [9] a across
#> Levels: active act across achieve account accept absolute about able a
# Also works for nesting
nested_with_dict <- as_nanoarrow_array(data.frame(x = factor(letters[1:5])))
convert_array(nested_with_dict)
#> x
#> 1 a
#> 2 b
#> 3 c
#> 4 d
#> 5 e
convert_array(nested_with_dict, vctrs::partial_frame(x = factor()))
#> x
#> 1 a
#> 2 b
#> 3 c
#> 4 d
#> 5 e
```
It's hard to benchmark this because ALTREP makes the effect of some
conversions deferred, but it seems like the performance characteristics
are vaguely similar to what happens in Arrow (probably Arrow is faster
for many chunks):
``` r
library(nanoarrow)
num_levels <- 1e1
vec_len <- 1e6
vec <- nanoarrow:::vec_gen(character(), n = num_levels)
vec_chr <- do.call(c, rep(list(vec), vec_len / num_levels))
vec_chr <- nanoarrow:::vec_shuffle(vec_chr)
vec_fct <- factor(vec_chr)
levels <- levels(vec_fct)
array_chr <- as_nanoarrow_array(vec_chr)
array_fct <- as_nanoarrow_array(vec_fct)
arrow_chr <- arrow::as_arrow_array(array_chr)
arrow_fct <- arrow::as_arrow_array(array_fct)
bench::mark(
convert_array(array_chr),
as.vector(arrow_chr)
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 convert_array(array_chr) 2.3µs 2.71µs 328706. 23.95KB 65.8
#> 2 as.vector(arrow_chr) 4.26µs 4.8µs 191042. 7.87KB 38.2
bench::mark(
convert_array(array_fct),
convert_array(array_fct, factor()),
convert_array(array_fct, factor(levels = levels)),
as.vector(arrow_fct$cast(arrow::string())),
as.vector(arrow_fct),
check = FALSE
)
#> # A tibble: 5 × 6
#> expression min median
`itr/sec`
#> <bch:expr> <bch:tm> <bch:tm>
<dbl>
#> 1 convert_array(array_fct) 40.55ms 41.06ms
24.4
#> 2 convert_array(array_fct, factor()) 2.97ms 3.18ms
306.
#> 3 convert_array(array_fct, factor(levels = levels)) 2.83ms 3.15ms
314.
#> 4 as.vector(arrow_fct$cast(arrow::string())) 48.35ms 49.18ms
20.2
#> 5 as.vector(arrow_fct) 6.19µs 7.22µs
110825.
#> # ℹ 2 more variables: mem_alloc <bch:byt>, `gc/sec` <dbl>
# Mitigate ALTREP
bench::mark(
convert_array(array_fct)[1e6:1],
convert_array(array_fct, factor())[1e6:1],
convert_array(array_fct, factor(levels = levels))[1e6:1],
as.vector(arrow_fct$cast(arrow::string()))[1e6:1],
as.vector(arrow_fct)[1e6:1],
check = FALSE
)
#> # A tibble: 5 × 6
#> expression min
median
#> <bch:expr> <bch:tm>
<bch:tm>
#> 1 convert_array(array_fct)[1e+06:1] 45.87ms
45.87ms
#> 2 convert_array(array_fct, factor())[1e+06:1] 5.42ms
5.78ms
#> 3 convert_array(array_fct, factor(levels = levels))[1e+06:1] 5.39ms
5.79ms
#> 4 as.vector(arrow_fct$cast(arrow::string()))[1e+06:1] 92.85ms
94.93ms
#> 5 as.vector(arrow_fct)[1e+06:1] 15.42ms
15.76ms
#> # ℹ 3 more variables: `itr/sec` <dbl>, mem_alloc <bch:byt>, `gc/sec`
<dbl>
```
<sup>Created on 2023-08-23 with [reprex
v2.0.2](https://reprex.tidyverse.org)</sup>
---------
Co-authored-by: Kirill Müller <[email protected]>
---
r/NAMESPACE | 2 +
r/R/convert-array.R | 114 +++++++++++++++++++++++++------
r/R/infer-ptype.R | 9 ++-
r/man/as_nanoarrow_schema.Rd | 4 +-
r/man/convert_array.Rd | 9 ++-
r/src/convert.c | 7 --
r/src/convert_array.c | 22 +++---
r/src/materialize.c | 116 ++++++++++++++++++++++++++++++-
r/src/materialize_blob.h | 2 +-
r/src/materialize_chr.h | 23 +++----
r/src/materialize_common.h | 1 +
r/src/materialize_date.h | 2 +-
r/src/materialize_dbl.h | 34 +--------
r/src/materialize_int.h | 4 ++
r/src/materialize_lgl.h | 4 ++
r/src/materialize_unspecified.h | 4 ++
r/tests/testthat/test-convert-array.R | 125 ++++++++++++++++++++++++++++++----
r/tests/testthat/test-infer-ptype.R | 2 +-
18 files changed, 382 insertions(+), 102 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 51a4658..aa19def 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -43,6 +43,8 @@ S3method(as_nanoarrow_schema,Field)
S3method(as_nanoarrow_schema,Schema)
S3method(as_nanoarrow_schema,nanoarrow_schema)
S3method(convert_array,default)
+S3method(convert_array,double)
+S3method(convert_array,factor)
S3method(convert_array,vctrs_partial_frame)
S3method(format,nanoarrow_array)
S3method(format,nanoarrow_array_stream)
diff --git a/r/R/convert-array.R b/r/R/convert-array.R
index 97ecc14..dcf5715 100644
--- a/r/R/convert-array.R
+++ b/r/R/convert-array.R
@@ -49,7 +49,13 @@
#' - [character()]: String and large string types can be converted to
#' [character()]. The conversion does not check for valid UTF-8: if you need
#' finer-grained control over encodings, use `to = blob::blob()`.
-#' - [Date][as.Date]: Only the date32 type can be converted to an R Date
vector.
+#' - [factor()]: Dictionary-encoded arrays of strings can be converted to
+#' `factor()`; however, this must be specified explicitly (i.e.,
+#' `convert_array(array, factor())`) because arrays arriving
+#' in chunks can have dictionaries that contain different levels. Use
+#' `convert_array(array, factor(levels = c(...)))` to materialize an array
+#' into a vector with known levels.
+#' - [Date][as.Date()]: Only the date32 type can be converted to an R Date
vector.
#' - [hms::hms()]: Time32 and time64 types can be converted to [hms::hms()].
#' - [difftime()]: Time32, time64, and duration types can be converted to
#' R [difftime()] vectors. The value is converted to match the [units()]
@@ -64,8 +70,7 @@
#'
#' In addition to the above conversions, a null array may be converted to any
#' target prototype except [data.frame()]. Extension arrays are currently
-#' converted as their storage type; dictionary-encoded arrays are not
-#' currently supported.
+#' converted as their storage type.
#'
#' @examples
#' array <- as_nanoarrow_array(data.frame(x = 1:5))
@@ -80,6 +85,16 @@ convert_array <- function(array, to = NULL, ...) {
#' @export
convert_array.default <- function(array, to = NULL, ..., .from_c = FALSE) {
if (.from_c) {
+ # Handle default dictionary conversion since it's the same for all types
+ dictionary <- array$dictionary
+
+ if (!is.null(dictionary)) {
+ values <- .Call(nanoarrow_c_convert_array, dictionary, to)
+ array$dictionary <- NULL
+ indices <- .Call(nanoarrow_c_convert_array, array, integer())
+ return(values[indices + 1L])
+ }
+
stop_cant_convert_array(array, to)
}
@@ -96,10 +111,45 @@ convert_array.default <- function(array, to = NULL, ...,
.from_c = FALSE) {
# we call convert_array() to dispatch to conversions defined via S3
# dispatch, making sure to let the default method know that we've already
# tried the internal C conversions.
-convert_array_from_c <- function(array, to) {
+convert_fallback_other <- function(array, offset, length, to) {
+ # If we need to modify offset/length, do it using a shallow copy.
+ if (!is.null(offset)) {
+ array <- nanoarrow_array_modify(
+ array,
+ list(offset = offset, length = length),
+ validate = FALSE
+ )
+ }
+
+ # Call convert_array() on a single chunk. Use .from_c = TRUE to ensure that
+ # methods do not attempt to pass the same array back to the C conversions.
+ # When the result is passed back to C it is checked enough to avoid segfault
+ # but not necessarily for correctness (e.g., factors with levels that don't
+ # correspond to 'to'). This result may be used as-is or may be copied into
+ # a slice of another vector.
convert_array(array, to, .from_c = TRUE)
}
+#' @export
+convert_array.double <- function(array, to, ...) {
+ # Handle conversion from decimal128 via arrow
+ schema <- infer_nanoarrow_schema(array)
+ parsed <- nanoarrow_schema_parse(schema)
+ if (parsed$type == "decimal128") {
+ assert_arrow_installed(
+ sprintf(
+ "convert %s array to object of type double",
+ nanoarrow_schema_formatted(schema)
+ )
+ )
+
+ arrow_array <- as_arrow_array.nanoarrow_array(array)
+ arrow_array$as_vector()
+ } else {
+ NextMethod()
+ }
+}
+
#' @export
convert_array.vctrs_partial_frame <- function(array, to, ...) {
ptype <- infer_nanoarrow_ptype(array)
@@ -111,6 +161,45 @@ convert_array.vctrs_partial_frame <- function(array, to,
...) {
.Call(nanoarrow_c_convert_array, array, ptype)
}
+#' @export
+convert_array.factor <- function(array, to, ...) {
+ if (!is.null(array$dictionary)) {
+ levels_final <- levels(to)
+ levels <- convert_array(array$dictionary, character())
+ array$dictionary <- NULL
+ indices <- convert_array(array, integer()) + 1L
+
+ # Handle empty factor() as the sentinel for "auto levels"
+ if (identical(levels(to), character())) {
+ levels(to) <- levels
+ }
+
+ if (identical(levels, levels(to))) {
+ fct_data <- indices
+ } else if (all(levels %in% levels(to))) {
+ level_map <- match(levels, levels(to))
+ fct_data <- level_map[indices]
+ } else {
+ stop("Error converting to factor: some levels in data do not exist in
levels")
+ }
+ } else {
+ strings <- convert_array(array, character())
+
+ # Handle empty factor() as the sentinel for "auto levels"
+ if (identical(levels(to), character())) {
+ fct_data <- factor(strings, levels)
+ levels(to) <- levels(fct_data)
+ } else {
+ fct_data <- factor(strings, levels = levels(to))
+ }
+ }
+
+ # Restore other attributes (e.g., ordered, labels)
+ attributes(fct_data) <- attributes(to)
+ fct_data
+}
+
+
stop_cant_convert_array <- function(array, to, n = 0) {
stop_cant_convert_schema(infer_nanoarrow_schema(array), to, n - 1)
}
@@ -141,20 +230,3 @@ stop_cant_convert_schema <- function(schema, to, n = 0) {
stop(cnd)
}
-
-# Called from C for decimal types
-convert_decimal_to_double <- function(array, schema, offset, length) {
- assert_arrow_installed(
- sprintf(
- "convert %s array to object of type double",
- nanoarrow_schema_formatted(schema)
- )
- )
-
- array2 <- nanoarrow_allocate_array()
- schema2 <- nanoarrow_allocate_schema()
- nanoarrow_pointer_export(array, array2)
- nanoarrow_pointer_export(schema, schema2)
- arrow_array <- arrow::Array$import_from_c(array2, schema2)
- arrow_array$Slice(offset, length)$as_vector()
-}
diff --git a/r/R/infer-ptype.R b/r/R/infer-ptype.R
index 460c4cb..3c38e2f 100644
--- a/r/R/infer-ptype.R
+++ b/r/R/infer-ptype.R
@@ -98,6 +98,13 @@ infer_ptype_other <- function(schema) {
ptype <- infer_nanoarrow_ptype(schema$children[[1]])
vctrs::list_of(.ptype = ptype)
},
+ "dictionary" = {
+ # Even though R's 'factor' can handle a dictionary of strings
+ # (perhaps the most common case), an array arriving in chunks may have
+ # different dictionary arrays. Thus, the best type-stable default we can
+ # achieve is to expand dictionaries.
+ infer_nanoarrow_ptype(schema$dictionary)
+ },
stop_cant_infer_ptype(schema, n = -1)
)
}
@@ -108,7 +115,7 @@ stop_cant_infer_ptype <- function(schema, n = 0) {
if (is.null(schema$name) || identical(schema$name, "")) {
cnd <- simpleError(
sprintf(
- "Can't infer R vector type for array <%s>",
+ "Can't infer R vector type for <%s>",
schema_label
),
call = sys.call(n - 1)
diff --git a/r/man/as_nanoarrow_schema.Rd b/r/man/as_nanoarrow_schema.Rd
index db32ff3..20f97f6 100644
--- a/r/man/as_nanoarrow_schema.Rd
+++ b/r/man/as_nanoarrow_schema.Rd
@@ -33,8 +33,8 @@ An object of class 'nanoarrow_schema'
\description{
In nanoarrow a 'schema' refers to a \verb{struct ArrowSchema} as defined in the
Arrow C Data interface. This data structure can be used to represent an
-\code{\link[arrow:schema]{arrow::schema()}}, an
\code{\link[arrow:Field]{arrow::field()}}, or an \code{arrow::DataType}. Note
that
-in nanoarrow, an \code{\link[arrow:schema]{arrow::schema()}} and a
non-nullable \code{\link[arrow:data-type]{arrow::struct()}}
+\code{\link[arrow:Schema]{arrow::schema()}}, an
\code{\link[arrow:Field]{arrow::field()}}, or an \code{arrow::DataType}. Note
that
+in nanoarrow, an \code{\link[arrow:Schema]{arrow::schema()}} and a
non-nullable \code{\link[arrow:data-type]{arrow::struct()}}
are represented identically.
}
\examples{
diff --git a/r/man/convert_array.Rd b/r/man/convert_array.Rd
index bb45ee9..6ed0989 100644
--- a/r/man/convert_array.Rd
+++ b/r/man/convert_array.Rd
@@ -41,6 +41,12 @@ through a floating-point double (e.g., very large uint64 and
int64 values).
\item \code{\link[=character]{character()}}: String and large string types can
be converted to
\code{\link[=character]{character()}}. The conversion does not check for valid
UTF-8: if you need
finer-grained control over encodings, use \code{to = blob::blob()}.
+\item \code{\link[=factor]{factor()}}: Dictionary-encoded arrays of strings
can be converted to
+\code{factor()}; however, this must be specified explicitly (i.e.,
+\code{convert_array(array, factor())}) because arrays arriving
+in chunks can have dictionaries that contain different levels. Use
+\code{convert_array(array, factor(levels = c(...)))} to materialize an array
+into a vector with known levels.
\item \link[=as.Date]{Date}: Only the date32 type can be converted to an R
Date vector.
\item \code{\link[hms:hms]{hms::hms()}}: Time32 and time64 types can be
converted to \code{\link[hms:hms]{hms::hms()}}.
\item \code{\link[=difftime]{difftime()}}: Time32, time64, and duration types
can be converted to
@@ -57,8 +63,7 @@ however, a warning will be raised if any non-null values are
encountered.
In addition to the above conversions, a null array may be converted to any
target prototype except \code{\link[=data.frame]{data.frame()}}. Extension
arrays are currently
-converted as their storage type; dictionary-encoded arrays are not
-currently supported.
+converted as their storage type.
}
\examples{
array <- as_nanoarrow_array(data.frame(x = 1:5))
diff --git a/r/src/convert.c b/r/src/convert.c
index 6590806..22bbfaa 100644
--- a/r/src/convert.c
+++ b/r/src/convert.c
@@ -309,13 +309,6 @@ int nanoarrow_converter_set_schema(SEXP converter_xptr,
SEXP schema_xptr) {
UNPROTECT(1);
}
- // Sub-par error for dictionary types until we have a way to deal with them
- if (converter->schema_view.type == NANOARROW_TYPE_DICTIONARY) {
- ArrowErrorSet(&converter->error,
- "Conversion to dictionary-encoded array is not supported");
- return ENOTSUP;
- }
-
SET_VECTOR_ELT(converter_shelter, 1, schema_xptr);
ArrowArrayViewReset(&converter->array_view);
diff --git a/r/src/convert_array.c b/r/src/convert_array.c
index 0cd66be..8bb698a 100644
--- a/r/src/convert_array.c
+++ b/r/src/convert_array.c
@@ -41,8 +41,9 @@ enum VectorType nanoarrow_infer_vector_type_array(SEXP
array_xptr);
// dispatch to find a convert_array() method (or error if there
// isn't one)
static SEXP call_convert_array(SEXP array_xptr, SEXP ptype_sexp) {
- SEXP fun = PROTECT(Rf_install("convert_array_from_c"));
- SEXP call = PROTECT(Rf_lang3(fun, array_xptr, ptype_sexp));
+ SEXP fun = PROTECT(Rf_install("convert_fallback_other"));
+ // offset/length don't need to be modified in this case
+ SEXP call = PROTECT(Rf_lang5(fun, array_xptr, R_NilValue, R_NilValue,
ptype_sexp));
SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
UNPROTECT(3);
return result;
@@ -100,12 +101,17 @@ static SEXP convert_array_default(SEXP array_xptr, enum
VectorType vector_type,
}
static SEXP convert_array_chr(SEXP array_xptr) {
- SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_xptr));
- if (result == R_NilValue) {
- call_stop_cant_convert_array(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
+ struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr);
+ if (array->dictionary == NULL) {
+ SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_xptr));
+ if (result == R_NilValue) {
+ call_stop_cant_convert_array(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
+ }
+ UNPROTECT(1);
+ return result;
+ } else {
+ return convert_array_default(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
}
- UNPROTECT(1);
- return result;
}
SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp);
@@ -210,7 +216,7 @@ SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP
ptype_sexp) {
Rf_inherits(ptype_sexp, "Date") || Rf_inherits(ptype_sexp,
"hms") ||
Rf_inherits(ptype_sexp, "POSIXct") ||
Rf_inherits(ptype_sexp, "difftime")) {
- return convert_array_default(array_xptr, VECTOR_TYPE_OTHER, ptype_sexp);
+ return convert_array_default(array_xptr, VECTOR_TYPE_UNINITIALIZED,
ptype_sexp);
} else {
return call_convert_array(array_xptr, ptype_sexp);
}
diff --git a/r/src/materialize.c b/r/src/materialize.c
index 48b8c32..e4c8db7 100644
--- a/r/src/materialize.c
+++ b/r/src/materialize.c
@@ -21,6 +21,8 @@
#include "nanoarrow.h"
+#include "util.h"
+
// Needed for the list_of materializer
#include "convert.h"
@@ -109,6 +111,16 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t
len) {
SEXP result;
if (Rf_isObject(ptype)) {
+ // There may be a more accurate test that more precisely captures the case
+ // where a user has specified a valid ptype that doesn't work in a
preallocate
+ // + fill conversion.
+ if (Rf_inherits(ptype, "factor")) {
+ SEXP levels = Rf_getAttrib(ptype, R_LevelsSymbol);
+ if (Rf_length(levels) == 0) {
+ Rf_error("Can't allocate ptype of class 'factor' with empty levels");
+ }
+ }
+
if (nanoarrow_ptype_is_data_frame(ptype)) {
R_xlen_t num_cols = Rf_xlength(ptype);
result = PROTECT(Rf_allocVector(VECSXP, num_cols));
@@ -178,12 +190,95 @@ static void fill_vec_with_nulls(SEXP x, R_xlen_t offset,
R_xlen_t len) {
}
}
+static int nanoarrow_materialize_other(struct RConverter* converter,
+ SEXP converter_xptr) {
+ // Ensure that we have a ptype SEXP to send in the call back to R
+ if (converter->ptype_view.ptype == R_NilValue) {
+ SEXP ptype =
PROTECT(nanoarrow_alloc_type(converter->ptype_view.vector_type, 0));
+ converter->ptype_view.ptype = ptype;
+ SET_VECTOR_ELT(R_ExternalPtrProtected(converter_xptr), 0, ptype);
+ UNPROTECT(1);
+ }
+
+ // A unique situation where we don't want owning external pointers because
we know
+ // these are protected for the duration of our call into R and because we
don't want
+ // the underlying array to be released and invalidate the converter. The R
code in
+ // convert_fallback_other() takes care of ensuring an independent copy with
the correct
+ // offset/length.
+ SEXP schema_xptr =
+ PROTECT(R_MakeExternalPtr(converter->schema_view.schema, R_NilValue,
R_NilValue));
+ Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
+ // We do need to set the protected member of the array external pointer to
signal that
+ // it is not an independent array (i.e., force a shallow copy).
+ SEXP array_xptr = PROTECT(
+ R_MakeExternalPtr(converter->array_view.array, schema_xptr,
converter_xptr));
+ Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array);
+
+ SEXP offset_sexp =
+ PROTECT(Rf_ScalarReal(converter->src.array_view->offset +
converter->src.offset));
+ SEXP length_sexp = PROTECT(Rf_ScalarReal(converter->src.length));
+
+ SEXP fun = PROTECT(Rf_install("convert_fallback_other"));
+ SEXP call = PROTECT(
+ Rf_lang5(fun, array_xptr, offset_sexp, length_sexp,
converter->ptype_view.ptype));
+ SEXP result_src = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
+
+ // Currently this method can only handle the case where result_src and dst
have the same
+ // SEXP type and length. This won't work for a data frame/record array-like
result.
+ if (Rf_xlength(result_src) != converter->dst.length) {
+ Rf_error("Unexpected length in result of
nanoarrow:::convert_fallback_other()");
+ }
+
+ if (TYPEOF(result_src) != TYPEOF(converter->dst.vec_sexp)) {
+ Rf_error("Unexpected SEXP type in result of
nanoarrow:::convert_fallback_other()");
+ }
+
+ switch (TYPEOF(result_src)) {
+ case REALSXP:
+ memcpy(REAL(converter->dst.vec_sexp) + converter->dst.offset,
REAL(result_src),
+ converter->dst.length * sizeof(double));
+ break;
+ case INTSXP:
+ case LGLSXP:
+ memcpy(INTEGER(converter->dst.vec_sexp) + converter->dst.offset,
+ INTEGER(result_src), converter->dst.length * sizeof(int));
+ break;
+ case STRSXP:
+ for (R_xlen_t i = 0; i < converter->dst.length; i++) {
+ SET_STRING_ELT(converter->dst.vec_sexp, converter->dst.offset + i,
+ STRING_ELT(result_src, i));
+ }
+ break;
+ case VECSXP:
+ for (R_xlen_t i = 0; i < converter->dst.length; i++) {
+ SET_VECTOR_ELT(converter->dst.vec_sexp, converter->dst.offset + i,
+ VECTOR_ELT(result_src, i));
+ }
+ break;
+ case NILSXP:
+ // Do nothing if the function returned NULL
+ break;
+ default:
+ Rf_error(
+ "Unhandled SEXP type in conversion of
nanoarrow:::convert_fallback_other()");
+ break;
+ }
+
+ UNPROTECT(7);
+ return NANOARROW_OK;
+}
+
static int nanoarrow_materialize_data_frame(struct RConverter* converter,
SEXP converter_xptr) {
if (converter->ptype_view.vector_type != VECTOR_TYPE_DATA_FRAME) {
return EINVAL;
}
+ // Make sure we error for dictionary types
+ if (converter->src.array_view->array->dictionary != NULL) {
+ return EINVAL;
+ }
+
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
@@ -256,6 +351,11 @@ static int nanoarrow_materialize_list_of(struct
RConverter* converter,
struct ArrayViewSlice* src = &converter->src;
struct VectorSlice* dst = &converter->dst;
+ // Make sure we error for dictionary types
+ if (src->array_view->array->dictionary != NULL) {
+ return EINVAL;
+ }
+
const int32_t* offsets = src->array_view->buffer_views[1].data.as_int32;
const int64_t* large_offsets =
src->array_view->buffer_views[1].data.as_int64;
int64_t raw_src_offset = src->array_view->array->offset + src->offset;
@@ -309,7 +409,7 @@ static int nanoarrow_materialize_list_of(struct RConverter*
converter,
return NANOARROW_OK;
}
-int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr) {
+static int nanoarrow_materialize_base(struct RConverter* converter, SEXP
converter_xptr) {
struct ArrayViewSlice* src = &converter->src;
struct VectorSlice* dst = &converter->dst;
struct MaterializeOptions* options = converter->options;
@@ -324,7 +424,7 @@ int nanoarrow_materialize(struct RConverter* converter,
SEXP converter_xptr) {
case VECTOR_TYPE_DBL:
return nanoarrow_materialize_dbl(converter);
case VECTOR_TYPE_CHR:
- return nanoarrow_materialize_chr(src, dst, options);
+ return nanoarrow_materialize_chr(converter);
case VECTOR_TYPE_POSIXCT:
return nanoarrow_materialize_posixct(converter);
case VECTOR_TYPE_DATE:
@@ -338,6 +438,16 @@ int nanoarrow_materialize(struct RConverter* converter,
SEXP converter_xptr) {
case VECTOR_TYPE_DATA_FRAME:
return nanoarrow_materialize_data_frame(converter, converter_xptr);
default:
- return ENOTSUP;
+ return nanoarrow_materialize_other(converter, converter_xptr);
+ }
+}
+
+int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr) {
+ int result = nanoarrow_materialize_base(converter, converter_xptr);
+
+ if (result != NANOARROW_OK) {
+ return nanoarrow_materialize_other(converter, converter_xptr);
+ } else {
+ return NANOARROW_OK;
}
}
diff --git a/r/src/materialize_blob.h b/r/src/materialize_blob.h
index b9919e8..eb6faba 100644
--- a/r/src/materialize_blob.h
+++ b/r/src/materialize_blob.h
@@ -35,7 +35,7 @@ static inline int nanoarrow_materialize_blob(struct
ArrayViewSlice* src,
case NANOARROW_TYPE_LARGE_BINARY:
break;
default:
- return EINVAL;
+ return ENOTSUP;
}
if (src->array_view->storage_type == NANOARROW_TYPE_NA) {
diff --git a/r/src/materialize_chr.h b/r/src/materialize_chr.h
index b8f1beb..2d57fdc 100644
--- a/r/src/materialize_chr.h
+++ b/r/src/materialize_chr.h
@@ -24,24 +24,23 @@
#include "materialize_common.h"
#include "nanoarrow.h"
-static inline int nanoarrow_materialize_chr(struct ArrayViewSlice* src,
- struct VectorSlice* dst,
- struct MaterializeOptions*
options) {
+static inline int nanoarrow_materialize_chr(struct RConverter* converter) {
+ struct ArrayViewSlice* src = &converter->src;
+ struct VectorSlice* dst = &converter->dst;
+
switch (src->array_view->storage_type) {
case NANOARROW_TYPE_NA:
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING);
+ }
+ return NANOARROW_OK;
+
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_LARGE_STRING:
break;
- default:
- return EINVAL;
- }
- if (src->array_view->storage_type == NANOARROW_TYPE_NA) {
- for (R_xlen_t i = 0; i < dst->length; i++) {
- SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING);
- }
-
- return NANOARROW_OK;
+ default:
+ return ENOTSUP;
}
struct ArrowStringView item;
diff --git a/r/src/materialize_common.h b/r/src/materialize_common.h
index e084f05..22f74c5 100644
--- a/r/src/materialize_common.h
+++ b/r/src/materialize_common.h
@@ -27,6 +27,7 @@
// or looping at the R level. Some of these types also need an SEXP ptype to
communicate
// additional information.
enum VectorType {
+ VECTOR_TYPE_UNINITIALIZED,
VECTOR_TYPE_NULL,
VECTOR_TYPE_UNSPECIFIED,
VECTOR_TYPE_LGL,
diff --git a/r/src/materialize_date.h b/r/src/materialize_date.h
index 9995e0e..925c895 100644
--- a/r/src/materialize_date.h
+++ b/r/src/materialize_date.h
@@ -37,7 +37,7 @@ static int nanoarrow_materialize_date(struct RConverter*
converter) {
}
}
- return EINVAL;
+ return ENOTSUP;
}
#endif
diff --git a/r/src/materialize_dbl.h b/r/src/materialize_dbl.h
index 865ef4a..3bebe47 100644
--- a/r/src/materialize_dbl.h
+++ b/r/src/materialize_dbl.h
@@ -23,36 +23,12 @@
#include "materialize_common.h"
#include "nanoarrow.h"
-#include "util.h"
-// Fall back to arrow for decimal conversion via a package helper
-static inline void nanoarrow_materialize_decimal_to_dbl(struct RConverter*
converter) {
- // A unique situation where we don't want owning external pointers because
we know
- // these are protected for the duration of our call into R and because we
don't want
- // then to be garbage collected and invalidate the converter
- SEXP array_xptr =
- PROTECT(R_MakeExternalPtr(converter->array_view.array, R_NilValue,
R_NilValue));
- Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array);
- SEXP schema_xptr =
- PROTECT(R_MakeExternalPtr(converter->schema_view.schema, R_NilValue,
R_NilValue));
- Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
-
- SEXP offset_sexp = PROTECT(Rf_ScalarReal(converter->src.offset));
- SEXP length_sexp = PROTECT(Rf_ScalarReal(converter->src.length));
-
- SEXP fun = PROTECT(Rf_install("convert_decimal_to_double"));
- SEXP call = PROTECT(Rf_lang5(fun, array_xptr, schema_xptr, offset_sexp,
length_sexp));
- SEXP result_src = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
- if (Rf_xlength(result_src) != converter->dst.length) {
- Rf_error("Unexpected result in call to Arrow for decimal conversion");
+static inline int nanoarrow_materialize_dbl(struct RConverter* converter) {
+ if (converter->src.array_view->array->dictionary != NULL) {
+ return ENOTSUP;
}
- memcpy(REAL(converter->dst.vec_sexp) + converter->dst.offset,
REAL(result_src),
- converter->dst.length * sizeof(double));
- UNPROTECT(7);
-}
-
-static inline int nanoarrow_materialize_dbl(struct RConverter* converter) {
struct ArrayViewSlice* src = &converter->src;
struct VectorSlice* dst = &converter->dst;
double* result = REAL(dst->vec_sexp);
@@ -110,10 +86,6 @@ static inline int nanoarrow_materialize_dbl(struct
RConverter* converter) {
}
break;
- case NANOARROW_TYPE_DECIMAL128:
- nanoarrow_materialize_decimal_to_dbl(converter);
- break;
-
default:
return EINVAL;
}
diff --git a/r/src/materialize_int.h b/r/src/materialize_int.h
index ffd39dd..60ead5a 100644
--- a/r/src/materialize_int.h
+++ b/r/src/materialize_int.h
@@ -27,6 +27,10 @@
static inline int nanoarrow_materialize_int(struct ArrayViewSlice* src,
struct VectorSlice* dst,
struct MaterializeOptions*
options) {
+ if (src->array_view->array->dictionary != NULL) {
+ return ENOTSUP;
+ }
+
int* result = INTEGER(dst->vec_sexp);
int64_t n_bad_values = 0;
diff --git a/r/src/materialize_lgl.h b/r/src/materialize_lgl.h
index a23802c..6dbfe12 100644
--- a/r/src/materialize_lgl.h
+++ b/r/src/materialize_lgl.h
@@ -26,6 +26,10 @@
static int nanoarrow_materialize_lgl(struct ArrayViewSlice* src, struct
VectorSlice* dst,
struct MaterializeOptions* options) {
+ if (src->array_view->array->dictionary != NULL) {
+ return ENOTSUP;
+ }
+
// True for all the types supported here
const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8;
const uint8_t* data_buffer = src->array_view->buffer_views[1].data.as_uint8;
diff --git a/r/src/materialize_unspecified.h b/r/src/materialize_unspecified.h
index 8f2e27a..72bcc23 100644
--- a/r/src/materialize_unspecified.h
+++ b/r/src/materialize_unspecified.h
@@ -27,6 +27,10 @@
static inline int nanoarrow_materialize_unspecified(struct ArrayViewSlice* src,
struct VectorSlice* dst,
struct MaterializeOptions*
options) {
+ if (src->array_view->array->dictionary != NULL) {
+ return ENOTSUP;
+ }
+
int* result = LOGICAL(dst->vec_sexp);
int64_t total_offset = src->array_view->array->offset + src->offset;
diff --git a/r/tests/testthat/test-convert-array.R
b/r/tests/testthat/test-convert-array.R
index 7b982b7..4481758 100644
--- a/r/tests/testthat/test-convert-array.R
+++ b/r/tests/testthat/test-convert-array.R
@@ -56,7 +56,7 @@ test_that("convert_array() errors for unsupported array", {
unsupported_array <- nanoarrow_array_init(na_interval_day_time())
expect_error(
convert_array(as_nanoarrow_array(unsupported_array)),
- "Can't infer R vector type for array <interval_day_time>"
+ "Can't infer R vector type for <interval_day_time>"
)
})
@@ -254,6 +254,16 @@ test_that("convert to vector works for null -> logical()",
{
)
})
+test_that("convert to vector works for dictionary<boolean> -> logical()", {
+ array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
+ array$dictionary <- as_nanoarrow_array(c(TRUE, FALSE, NA))
+
+ expect_identical(
+ convert_array(array, logical()),
+ c(TRUE, FALSE, NA, FALSE, TRUE)
+ )
+})
+
test_that("convert to vector errors for bad array to logical()", {
expect_error(
convert_array(as_nanoarrow_array(letters), logical()),
@@ -328,6 +338,16 @@ test_that("convert to vector works for null -> logical()",
{
)
})
+test_that("convert to vector works for dictionary<integer> -> integer()", {
+ array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
+ array$dictionary <- as_nanoarrow_array(c(123L, 0L, NA_integer_))
+
+ expect_identical(
+ convert_array(array, integer()),
+ c(123L, 0L, NA_integer_, 0L, 123L)
+ )
+})
+
test_that("convert to vector warns for invalid integer()", {
array <- as_nanoarrow_array(.Machine$integer.max + 1)
expect_warning(
@@ -409,10 +429,18 @@ test_that("convert to vector works for decimal128 ->
double()", {
skip_if_not_installed("arrow")
array <-
as_nanoarrow_array(arrow::Array$create(1:10)$cast(arrow::decimal128(20, 10)))
+
+ # Check via S3 dispatch
expect_equal(
convert_array(array, double()),
as.double(1:10)
)
+
+ # ...and via C -> S3 dispatch
+ expect_equal(
+ convert_array.default(array, double()),
+ as.double(1:10)
+ )
})
test_that("convert to vector works for null -> double()", {
@@ -426,6 +454,16 @@ test_that("convert to vector works for null -> double()", {
)
})
+test_that("convert to vector works for dictionary<double> -> double()", {
+ array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
+ array$dictionary <- as_nanoarrow_array(c(123, 0, NA_real_))
+
+ expect_identical(
+ convert_array(array, double()),
+ c(123, 0, NA_real_, 0, 123)
+ )
+})
+
test_that("convert to vector errors for bad array to double()", {
expect_error(
convert_array(as_nanoarrow_array(letters), double()),
@@ -463,6 +501,77 @@ test_that("convert to vector works for null ->
character()", {
)
})
+test_that("convert to vector works for dictionary<string> -> character()", {
+ array <- as_nanoarrow_array(factor(letters[5:1]))
+
+ # Via S3 dispatch
+ expect_identical(
+ convert_array(array, character()),
+ c("e", "d", "c", "b", "a")
+ )
+
+ # Via C -> S3 dispatch
+ expect_identical(
+ convert_array.default(array, character()),
+ c("e", "d", "c", "b", "a")
+ )
+})
+
+test_that("convert to vector works for dictionary<string> -> factor()", {
+ array <- as_nanoarrow_array(factor(letters[5:1]))
+
+ # With empty levels
+ expect_identical(
+ convert_array(array, factor()),
+ factor(letters[5:1])
+ )
+
+ # With identical levels
+ expect_identical(
+ convert_array(array, factor(levels = c("a", "b", "c", "d", "e"))),
+ factor(letters[5:1])
+ )
+
+ # With mismatched levels
+ expect_identical(
+ convert_array(array, factor(levels = c("b", "a", "c", "e", "d"))),
+ factor(letters[5:1], levels = c("b", "a", "c", "e", "d"))
+ )
+
+ expect_error(
+ convert_array(array, factor(levels = letters[-4])),
+ "some levels in data do not exist"
+ )
+})
+
+test_that("batched convert to vector works for dictionary<string> ->
factor()", {
+ # A slightly different path: convert_array.factor() called from C multiple
+ # times with different dictionaries each time.
+ array1 <- as_nanoarrow_array(factor(letters[1:5]))
+ array2 <- as_nanoarrow_array(factor(letters[6:10]))
+ array3 <- as_nanoarrow_array(factor(letters[11:15]))
+
+ stream <- basic_array_stream(list(array1, array2, array3))
+ expect_identical(
+ convert_array_stream(stream, factor(levels = letters)),
+ factor(letters[1:15], levels = letters)
+ )
+})
+
+test_that("batched convert to vector errors for dictionary<string> ->
factor()", {
+ # We can't currently handle a preallocate + fill style conversion where the
+ # result is partial_factor().
+ array1 <- as_nanoarrow_array(factor(letters[1:5]))
+ array2 <- as_nanoarrow_array(factor(letters[6:10]))
+ array3 <- as_nanoarrow_array(factor(letters[11:15]))
+
+ stream <- basic_array_stream(list(array1, array2, array3))
+ expect_error(
+ convert_array_stream(stream, factor()),
+ "Can't allocate ptype of class 'factor'"
+ )
+})
+
test_that("convert to vector works for blob::blob()", {
skip_if_not_installed("blob")
@@ -515,7 +624,7 @@ test_that("convert to vector works for list ->
vctrs::list_of", {
# With bad ptype
expect_error(
convert_array(array_list, vctrs::list_of(.ptype = character())),
- "Can't convert array"
+ "Can't convert `item`"
)
# With malformed ptype
@@ -552,7 +661,7 @@ test_that("convert to vector works for large_list ->
vctrs::list_of", {
# With bad ptype
expect_error(
convert_array(array_list, vctrs::list_of(.ptype = character())),
- "Can't convert array"
+ "Can't convert `item`"
)
})
@@ -581,7 +690,7 @@ test_that("convert to vector works for fixed_size_list ->
vctrs::list_of", {
# With bad ptype
expect_error(
convert_array(array_list, vctrs::list_of(.ptype = character())),
- "Can't convert array"
+ "Can't convert `item`"
)
})
@@ -798,11 +907,3 @@ test_that("convert to vector warns for stripped extension
type", {
"x: Converting unknown extension some_ext"
)
})
-
-test_that("convert to vector errors for dictionary types", {
- dict_array <- as_nanoarrow_array(factor(letters[1:5]))
- expect_error(
- convert_array(dict_array, character()),
- "Conversion to dictionary-encoded array is not supported"
- )
-})
diff --git a/r/tests/testthat/test-infer-ptype.R
b/r/tests/testthat/test-infer-ptype.R
index 920d9d8..c6516f8 100644
--- a/r/tests/testthat/test-infer-ptype.R
+++ b/r/tests/testthat/test-infer-ptype.R
@@ -123,7 +123,7 @@ test_that("infer_nanoarrow_ptype() errors for types it
can't infer", {
unsupported_array <- nanoarrow_array_init(na_decimal256(3, 4))
expect_error(
infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_array)),
- "Can't infer R vector type for array <decimal256\\(3, 4\\)>"
+ "Can't infer R vector type for <decimal256\\(3, 4\\)>"
)
unsupported_struct <- nanoarrow_array_init(