This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 4ace1a2  feat(r): Implement dictionary conversion (#285)
4ace1a2 is described below

commit 4ace1a2522f1f9f06011e88992d429eb678412fa
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Aug 23 14:24:19 2023 -0300

    feat(r): Implement dictionary conversion (#285)
    
    This PR implements conversion from dictionary-encoded arrays to R
    vectors. To make this happen, it was also necessary to refactor a few
    things about the conversion process to make writing conversions in R
    easier. This is also with an eye towards extension type conversion (up
    next) where it is expected that extension types will mostly implement
    conversions as S3 `convert_array()` method. Previously this didn't work
    if converting a stream with multiple batches...now it does (although
    will incur S3 dispatch overhead for each batch for conversions that are
    not handled in C).
    
    Conversion from a dictionary always defaults to the conversion that
    would happen from the dictionary value type. This is because in Arrow
    the dictionary is a property of the array, not the type (whereas in R, a
    factor's levels are a property of the type, sort of). For a
    dictionary-encoded array that arrives in chunks, the only type-stable
    way to perform the conversion is by resolving the dictionary into the
    value type. This also makes it a more predictable default for other
    dictionary-encoded types (none of which have an R analogue).
    
    In practice this has low memory overhead because the most commonly
    dictionary-encoded type is a string, and because R has a global string
    pool; however, it is quite a bit slower than decoding directly to
    `factor()`.
    
    Like other conversions (and like `vctrs::vec_cast()`), you can also
    request a target `ptype`. This is implemented for `factor()`. If
    converting an array stream containing dictionary-encoded values, the
    `levels` need to be explicitly specified.
    
    ``` r
    library(nanoarrow)
    
    dict_array <- as_nanoarrow_array(sample(0:9, 10, replace = TRUE))
    dict_array$dictionary <- as_nanoarrow_array(stringr::words[1:10])
    
    # default conversion is always the diciontary (value) type
    infer_nanoarrow_ptype(dict_array)
    #> character(0)
    convert_array(dict_array)
    #>  [1] "absolute" "a"        "across"   "achieve"  "accept"   "a"
    #>  [7] "active"   "a"        "a"        "across"
    
    # this makes it type stable when consuming streams
    dict1 <- as_nanoarrow_array(factor(letters[1:5]))
    dict2 <- as_nanoarrow_array(factor(letters[6:10]))
    stream <- basic_array_stream(list(dict1, dict2))
    convert_array_stream(stream)
    #>  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
    
    # ...and more consistent when dealing with other value types
    dict_array$dictionary <- 
as_nanoarrow_array(arrow::as_arrow_array(lapply(1:10, function(i) 1:5 * i)))
    infer_nanoarrow_ptype(dict_array)
    #> <list_of<integer>[0]>
    
    # If you want a factor, you can specify it explicitly
    dict_array$dictionary <- as_nanoarrow_array(stringr::words[1:10])
    convert_array(dict_array, vctrs::partial_factor())
    #>  [1] "absolute" "a"        "across"   "achieve"  "accept"   "a"
    #>  [7] "active"   "a"        "a"        "across"
    convert_array(dict_array, factor(levels = stringr::words[10:1]))
    #>  [1] absolute a        across   achieve  accept   a        active   a
    #>  [9] a        across
    #> Levels: active act across achieve account accept absolute about able a
    
    # Also works for nesting
    nested_with_dict <- as_nanoarrow_array(data.frame(x = factor(letters[1:5])))
    
    convert_array(nested_with_dict)
    #>   x
    #> 1 a
    #> 2 b
    #> 3 c
    #> 4 d
    #> 5 e
    convert_array(nested_with_dict, vctrs::partial_frame(x = factor()))
    #>   x
    #> 1 a
    #> 2 b
    #> 3 c
    #> 4 d
    #> 5 e
    ```
    
    It's hard to benchmark this because ALTREP makes the effect of some
    conversions deferred, but it seems like the performance characteristics
    are vaguely similar to what happens in Arrow (probably Arrow is faster
    for many chunks):
    
    ``` r
    library(nanoarrow)
    
    num_levels <- 1e1
    vec_len <- 1e6
    
    vec <- nanoarrow:::vec_gen(character(), n = num_levels)
    vec_chr <- do.call(c, rep(list(vec), vec_len / num_levels))
    vec_chr <- nanoarrow:::vec_shuffle(vec_chr)
    vec_fct <- factor(vec_chr)
    levels <- levels(vec_fct)
    
    array_chr <- as_nanoarrow_array(vec_chr)
    array_fct <- as_nanoarrow_array(vec_fct)
    
    arrow_chr <- arrow::as_arrow_array(array_chr)
    arrow_fct <- arrow::as_arrow_array(array_fct)
    
    bench::mark(
      convert_array(array_chr),
      as.vector(arrow_chr)
    )
    #> # A tibble: 2 × 6
    #>   expression                    min   median `itr/sec` mem_alloc `gc/sec`
    #>   <bch:expr>               <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
    #> 1 convert_array(array_chr)    2.3µs   2.71µs   328706.   23.95KB     65.8
    #> 2 as.vector(arrow_chr)       4.26µs    4.8µs   191042.    7.87KB     38.2
    
    bench::mark(
      convert_array(array_fct),
      convert_array(array_fct, factor()),
      convert_array(array_fct, factor(levels = levels)),
      as.vector(arrow_fct$cast(arrow::string())),
      as.vector(arrow_fct),
      check = FALSE
    )
    #> # A tibble: 5 × 6
    #>   expression                                             min   median 
`itr/sec`
    #>   <bch:expr>                                        <bch:tm> <bch:tm>    
 <dbl>
    #> 1 convert_array(array_fct)                           40.55ms  41.06ms    
  24.4
    #> 2 convert_array(array_fct, factor())                  2.97ms   3.18ms    
 306.
    #> 3 convert_array(array_fct, factor(levels = levels))   2.83ms   3.15ms    
 314.
    #> 4 as.vector(arrow_fct$cast(arrow::string()))         48.35ms  49.18ms    
  20.2
    #> 5 as.vector(arrow_fct)                                6.19µs   7.22µs  
110825.
    #> # ℹ 2 more variables: mem_alloc <bch:byt>, `gc/sec` <dbl>
    
    # Mitigate ALTREP
    bench::mark(
      convert_array(array_fct)[1e6:1],
      convert_array(array_fct, factor())[1e6:1],
      convert_array(array_fct, factor(levels = levels))[1e6:1],
      as.vector(arrow_fct$cast(arrow::string()))[1e6:1],
      as.vector(arrow_fct)[1e6:1],
      check = FALSE
    )
    #> # A tibble: 5 × 6
    #>   expression                                                      min   
median
    #>   <bch:expr>                                                 <bch:tm> 
<bch:tm>
    #> 1 convert_array(array_fct)[1e+06:1]                           45.87ms  
45.87ms
    #> 2 convert_array(array_fct, factor())[1e+06:1]                  5.42ms   
5.78ms
    #> 3 convert_array(array_fct, factor(levels = levels))[1e+06:1]   5.39ms   
5.79ms
    #> 4 as.vector(arrow_fct$cast(arrow::string()))[1e+06:1]         92.85ms  
94.93ms
    #> 5 as.vector(arrow_fct)[1e+06:1]                               15.42ms  
15.76ms
    #> # ℹ 3 more variables: `itr/sec` <dbl>, mem_alloc <bch:byt>, `gc/sec` 
<dbl>
    ```
    
    <sup>Created on 2023-08-23 with [reprex
    v2.0.2](https://reprex.tidyverse.org)</sup>
    
    ---------
    
    Co-authored-by: Kirill Müller <[email protected]>
---
 r/NAMESPACE                           |   2 +
 r/R/convert-array.R                   | 114 +++++++++++++++++++++++++------
 r/R/infer-ptype.R                     |   9 ++-
 r/man/as_nanoarrow_schema.Rd          |   4 +-
 r/man/convert_array.Rd                |   9 ++-
 r/src/convert.c                       |   7 --
 r/src/convert_array.c                 |  22 +++---
 r/src/materialize.c                   | 116 ++++++++++++++++++++++++++++++-
 r/src/materialize_blob.h              |   2 +-
 r/src/materialize_chr.h               |  23 +++----
 r/src/materialize_common.h            |   1 +
 r/src/materialize_date.h              |   2 +-
 r/src/materialize_dbl.h               |  34 +--------
 r/src/materialize_int.h               |   4 ++
 r/src/materialize_lgl.h               |   4 ++
 r/src/materialize_unspecified.h       |   4 ++
 r/tests/testthat/test-convert-array.R | 125 ++++++++++++++++++++++++++++++----
 r/tests/testthat/test-infer-ptype.R   |   2 +-
 18 files changed, 382 insertions(+), 102 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 51a4658..aa19def 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -43,6 +43,8 @@ S3method(as_nanoarrow_schema,Field)
 S3method(as_nanoarrow_schema,Schema)
 S3method(as_nanoarrow_schema,nanoarrow_schema)
 S3method(convert_array,default)
+S3method(convert_array,double)
+S3method(convert_array,factor)
 S3method(convert_array,vctrs_partial_frame)
 S3method(format,nanoarrow_array)
 S3method(format,nanoarrow_array_stream)
diff --git a/r/R/convert-array.R b/r/R/convert-array.R
index 97ecc14..dcf5715 100644
--- a/r/R/convert-array.R
+++ b/r/R/convert-array.R
@@ -49,7 +49,13 @@
 #' - [character()]: String and large string types can be converted to
 #'   [character()]. The conversion does not check for valid UTF-8: if you need
 #'   finer-grained control over encodings, use `to = blob::blob()`.
-#' - [Date][as.Date]: Only the date32 type can be converted to an R Date 
vector.
+#' - [factor()]: Dictionary-encoded arrays of strings can be converted to
+#'   `factor()`; however, this must be specified explicitly (i.e.,
+#'   `convert_array(array, factor())`) because arrays arriving
+#'   in chunks can have dictionaries that contain different levels. Use
+#'   `convert_array(array, factor(levels = c(...)))` to materialize an array
+#'   into a vector with known levels.
+#' - [Date][as.Date()]: Only the date32 type can be converted to an R Date 
vector.
 #' - [hms::hms()]: Time32 and time64 types can be converted to [hms::hms()].
 #' - [difftime()]: Time32, time64, and duration types can be converted to
 #'   R [difftime()] vectors. The value is converted to match the [units()]
@@ -64,8 +70,7 @@
 #'
 #' In addition to the above conversions, a null array may be converted to any
 #' target prototype except [data.frame()]. Extension arrays are currently
-#' converted as their storage type; dictionary-encoded arrays are not
-#' currently supported.
+#' converted as their storage type.
 #'
 #' @examples
 #' array <- as_nanoarrow_array(data.frame(x = 1:5))
@@ -80,6 +85,16 @@ convert_array <- function(array, to = NULL, ...) {
 #' @export
 convert_array.default <- function(array, to = NULL, ..., .from_c = FALSE) {
   if (.from_c) {
+    # Handle default dictionary conversion since it's the same for all types
+    dictionary <- array$dictionary
+
+    if (!is.null(dictionary)) {
+      values <- .Call(nanoarrow_c_convert_array, dictionary, to)
+      array$dictionary <- NULL
+      indices <- .Call(nanoarrow_c_convert_array, array, integer())
+      return(values[indices + 1L])
+    }
+
     stop_cant_convert_array(array, to)
   }
 
@@ -96,10 +111,45 @@ convert_array.default <- function(array, to = NULL, ..., 
.from_c = FALSE) {
 # we call convert_array() to dispatch to conversions defined via S3
 # dispatch, making sure to let the default method know that we've already
 # tried the internal C conversions.
-convert_array_from_c <- function(array, to) {
+convert_fallback_other <- function(array, offset, length, to) {
+  # If we need to modify offset/length, do it using a shallow copy.
+  if (!is.null(offset)) {
+    array <- nanoarrow_array_modify(
+      array,
+      list(offset = offset, length = length),
+      validate = FALSE
+    )
+  }
+
+  # Call convert_array() on a single chunk. Use .from_c = TRUE to ensure that
+  # methods do not attempt to pass the same array back to the C conversions.
+  # When the result is passed back to C it is checked enough to avoid segfault
+  # but not necessarily for correctness (e.g., factors with levels that don't
+  # correspond to 'to'). This result may be used as-is or may be copied into
+  # a slice of another vector.
   convert_array(array, to, .from_c = TRUE)
 }
 
+#' @export
+convert_array.double <- function(array, to, ...) {
+  # Handle conversion from decimal128 via arrow
+  schema <- infer_nanoarrow_schema(array)
+  parsed <- nanoarrow_schema_parse(schema)
+  if (parsed$type == "decimal128") {
+    assert_arrow_installed(
+      sprintf(
+        "convert %s array to object of type double",
+        nanoarrow_schema_formatted(schema)
+      )
+    )
+
+    arrow_array <- as_arrow_array.nanoarrow_array(array)
+    arrow_array$as_vector()
+  } else {
+    NextMethod()
+  }
+}
+
 #' @export
 convert_array.vctrs_partial_frame <- function(array, to, ...) {
   ptype <- infer_nanoarrow_ptype(array)
@@ -111,6 +161,45 @@ convert_array.vctrs_partial_frame <- function(array, to, 
...) {
   .Call(nanoarrow_c_convert_array, array, ptype)
 }
 
+#' @export
+convert_array.factor <- function(array, to, ...) {
+  if (!is.null(array$dictionary)) {
+    levels_final <- levels(to)
+    levels <- convert_array(array$dictionary, character())
+    array$dictionary <- NULL
+    indices <- convert_array(array, integer()) + 1L
+
+    # Handle empty factor() as the sentinel for "auto levels"
+    if (identical(levels(to), character())) {
+      levels(to) <- levels
+    }
+
+    if (identical(levels, levels(to))) {
+      fct_data <- indices
+    } else if (all(levels %in% levels(to))) {
+      level_map <- match(levels, levels(to))
+      fct_data <- level_map[indices]
+    } else {
+      stop("Error converting to factor: some levels in data do not exist in 
levels")
+    }
+  } else {
+    strings <- convert_array(array, character())
+
+    # Handle empty factor() as the sentinel for "auto levels"
+    if (identical(levels(to), character())) {
+      fct_data <- factor(strings, levels)
+      levels(to) <- levels(fct_data)
+    } else {
+      fct_data <- factor(strings, levels = levels(to))
+    }
+  }
+
+  # Restore other attributes (e.g., ordered, labels)
+  attributes(fct_data) <- attributes(to)
+  fct_data
+}
+
+
 stop_cant_convert_array <- function(array, to, n = 0) {
   stop_cant_convert_schema(infer_nanoarrow_schema(array), to, n - 1)
 }
@@ -141,20 +230,3 @@ stop_cant_convert_schema <- function(schema, to, n = 0) {
 
   stop(cnd)
 }
-
-# Called from C for decimal types
-convert_decimal_to_double <- function(array, schema, offset, length) {
-  assert_arrow_installed(
-    sprintf(
-      "convert %s array to object of type double",
-      nanoarrow_schema_formatted(schema)
-    )
-  )
-
-  array2 <- nanoarrow_allocate_array()
-  schema2 <- nanoarrow_allocate_schema()
-  nanoarrow_pointer_export(array, array2)
-  nanoarrow_pointer_export(schema, schema2)
-  arrow_array <- arrow::Array$import_from_c(array2, schema2)
-  arrow_array$Slice(offset, length)$as_vector()
-}
diff --git a/r/R/infer-ptype.R b/r/R/infer-ptype.R
index 460c4cb..3c38e2f 100644
--- a/r/R/infer-ptype.R
+++ b/r/R/infer-ptype.R
@@ -98,6 +98,13 @@ infer_ptype_other <- function(schema) {
       ptype <- infer_nanoarrow_ptype(schema$children[[1]])
       vctrs::list_of(.ptype = ptype)
     },
+    "dictionary" = {
+      # Even though R's 'factor' can handle a dictionary of strings
+      # (perhaps the most common case), an array arriving in chunks may have
+      # different dictionary arrays. Thus, the best type-stable default we can
+      # achieve is to expand dictionaries.
+      infer_nanoarrow_ptype(schema$dictionary)
+    },
     stop_cant_infer_ptype(schema, n = -1)
   )
 }
@@ -108,7 +115,7 @@ stop_cant_infer_ptype <- function(schema, n = 0) {
   if (is.null(schema$name) || identical(schema$name, "")) {
     cnd <- simpleError(
       sprintf(
-        "Can't infer R vector type for array <%s>",
+        "Can't infer R vector type for <%s>",
         schema_label
       ),
       call = sys.call(n - 1)
diff --git a/r/man/as_nanoarrow_schema.Rd b/r/man/as_nanoarrow_schema.Rd
index db32ff3..20f97f6 100644
--- a/r/man/as_nanoarrow_schema.Rd
+++ b/r/man/as_nanoarrow_schema.Rd
@@ -33,8 +33,8 @@ An object of class 'nanoarrow_schema'
 \description{
 In nanoarrow a 'schema' refers to a \verb{struct ArrowSchema} as defined in the
 Arrow C Data interface. This data structure can be used to represent an
-\code{\link[arrow:schema]{arrow::schema()}}, an 
\code{\link[arrow:Field]{arrow::field()}}, or an \code{arrow::DataType}. Note 
that
-in nanoarrow, an \code{\link[arrow:schema]{arrow::schema()}} and a 
non-nullable \code{\link[arrow:data-type]{arrow::struct()}}
+\code{\link[arrow:Schema]{arrow::schema()}}, an 
\code{\link[arrow:Field]{arrow::field()}}, or an \code{arrow::DataType}. Note 
that
+in nanoarrow, an \code{\link[arrow:Schema]{arrow::schema()}} and a 
non-nullable \code{\link[arrow:data-type]{arrow::struct()}}
 are represented identically.
 }
 \examples{
diff --git a/r/man/convert_array.Rd b/r/man/convert_array.Rd
index bb45ee9..6ed0989 100644
--- a/r/man/convert_array.Rd
+++ b/r/man/convert_array.Rd
@@ -41,6 +41,12 @@ through a floating-point double (e.g., very large uint64 and 
int64 values).
 \item \code{\link[=character]{character()}}: String and large string types can 
be converted to
 \code{\link[=character]{character()}}. The conversion does not check for valid 
UTF-8: if you need
 finer-grained control over encodings, use \code{to = blob::blob()}.
+\item \code{\link[=factor]{factor()}}: Dictionary-encoded arrays of strings 
can be converted to
+\code{factor()}; however, this must be specified explicitly (i.e.,
+\code{convert_array(array, factor())}) because arrays arriving
+in chunks can have dictionaries that contain different levels. Use
+\code{convert_array(array, factor(levels = c(...)))} to materialize an array
+into a vector with known levels.
 \item \link[=as.Date]{Date}: Only the date32 type can be converted to an R 
Date vector.
 \item \code{\link[hms:hms]{hms::hms()}}: Time32 and time64 types can be 
converted to \code{\link[hms:hms]{hms::hms()}}.
 \item \code{\link[=difftime]{difftime()}}: Time32, time64, and duration types 
can be converted to
@@ -57,8 +63,7 @@ however, a warning will be raised if any non-null values are 
encountered.
 
 In addition to the above conversions, a null array may be converted to any
 target prototype except \code{\link[=data.frame]{data.frame()}}. Extension 
arrays are currently
-converted as their storage type; dictionary-encoded arrays are not
-currently supported.
+converted as their storage type.
 }
 \examples{
 array <- as_nanoarrow_array(data.frame(x = 1:5))
diff --git a/r/src/convert.c b/r/src/convert.c
index 6590806..22bbfaa 100644
--- a/r/src/convert.c
+++ b/r/src/convert.c
@@ -309,13 +309,6 @@ int nanoarrow_converter_set_schema(SEXP converter_xptr, 
SEXP schema_xptr) {
     UNPROTECT(1);
   }
 
-  // Sub-par error for dictionary types until we have a way to deal with them
-  if (converter->schema_view.type == NANOARROW_TYPE_DICTIONARY) {
-    ArrowErrorSet(&converter->error,
-                  "Conversion to dictionary-encoded array is not supported");
-    return ENOTSUP;
-  }
-
   SET_VECTOR_ELT(converter_shelter, 1, schema_xptr);
 
   ArrowArrayViewReset(&converter->array_view);
diff --git a/r/src/convert_array.c b/r/src/convert_array.c
index 0cd66be..8bb698a 100644
--- a/r/src/convert_array.c
+++ b/r/src/convert_array.c
@@ -41,8 +41,9 @@ enum VectorType nanoarrow_infer_vector_type_array(SEXP 
array_xptr);
 // dispatch to find a convert_array() method (or error if there
 // isn't one)
 static SEXP call_convert_array(SEXP array_xptr, SEXP ptype_sexp) {
-  SEXP fun = PROTECT(Rf_install("convert_array_from_c"));
-  SEXP call = PROTECT(Rf_lang3(fun, array_xptr, ptype_sexp));
+  SEXP fun = PROTECT(Rf_install("convert_fallback_other"));
+  // offset/length don't need to be modified in this case
+  SEXP call = PROTECT(Rf_lang5(fun, array_xptr, R_NilValue, R_NilValue, 
ptype_sexp));
   SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
   UNPROTECT(3);
   return result;
@@ -100,12 +101,17 @@ static SEXP convert_array_default(SEXP array_xptr, enum 
VectorType vector_type,
 }
 
 static SEXP convert_array_chr(SEXP array_xptr) {
-  SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_xptr));
-  if (result == R_NilValue) {
-    call_stop_cant_convert_array(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
+  struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr);
+  if (array->dictionary == NULL) {
+    SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_xptr));
+    if (result == R_NilValue) {
+      call_stop_cant_convert_array(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
+    }
+    UNPROTECT(1);
+    return result;
+  } else {
+    return convert_array_default(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
   }
-  UNPROTECT(1);
-  return result;
 }
 
 SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp);
@@ -210,7 +216,7 @@ SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP 
ptype_sexp) {
                Rf_inherits(ptype_sexp, "Date") || Rf_inherits(ptype_sexp, 
"hms") ||
                Rf_inherits(ptype_sexp, "POSIXct") ||
                Rf_inherits(ptype_sexp, "difftime")) {
-      return convert_array_default(array_xptr, VECTOR_TYPE_OTHER, ptype_sexp);
+      return convert_array_default(array_xptr, VECTOR_TYPE_UNINITIALIZED, 
ptype_sexp);
     } else {
       return call_convert_array(array_xptr, ptype_sexp);
     }
diff --git a/r/src/materialize.c b/r/src/materialize.c
index 48b8c32..e4c8db7 100644
--- a/r/src/materialize.c
+++ b/r/src/materialize.c
@@ -21,6 +21,8 @@
 
 #include "nanoarrow.h"
 
+#include "util.h"
+
 // Needed for the list_of materializer
 #include "convert.h"
 
@@ -109,6 +111,16 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t 
len) {
   SEXP result;
 
   if (Rf_isObject(ptype)) {
+    // There may be a more accurate test that more precisely captures the case
+    // where a user has specified a valid ptype that doesn't work in a 
preallocate
+    // + fill conversion.
+    if (Rf_inherits(ptype, "factor")) {
+      SEXP levels = Rf_getAttrib(ptype, R_LevelsSymbol);
+      if (Rf_length(levels) == 0) {
+        Rf_error("Can't allocate ptype of class 'factor' with empty levels");
+      }
+    }
+
     if (nanoarrow_ptype_is_data_frame(ptype)) {
       R_xlen_t num_cols = Rf_xlength(ptype);
       result = PROTECT(Rf_allocVector(VECSXP, num_cols));
@@ -178,12 +190,95 @@ static void fill_vec_with_nulls(SEXP x, R_xlen_t offset, 
R_xlen_t len) {
   }
 }
 
+static int nanoarrow_materialize_other(struct RConverter* converter,
+                                       SEXP converter_xptr) {
+  // Ensure that we have a ptype SEXP to send in the call back to R
+  if (converter->ptype_view.ptype == R_NilValue) {
+    SEXP ptype = 
PROTECT(nanoarrow_alloc_type(converter->ptype_view.vector_type, 0));
+    converter->ptype_view.ptype = ptype;
+    SET_VECTOR_ELT(R_ExternalPtrProtected(converter_xptr), 0, ptype);
+    UNPROTECT(1);
+  }
+
+  // A unique situation where we don't want owning external pointers because 
we know
+  // these are protected for the duration of our call into R and because we 
don't want
+  // the underlying array to be released and invalidate the converter. The R 
code in
+  // convert_fallback_other() takes care of ensuring an independent copy with 
the correct
+  // offset/length.
+  SEXP schema_xptr =
+      PROTECT(R_MakeExternalPtr(converter->schema_view.schema, R_NilValue, 
R_NilValue));
+  Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
+  // We do need to set the protected member of the array external pointer to 
signal that
+  // it is not an independent array (i.e., force a shallow copy).
+  SEXP array_xptr = PROTECT(
+      R_MakeExternalPtr(converter->array_view.array, schema_xptr, 
converter_xptr));
+  Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array);
+
+  SEXP offset_sexp =
+      PROTECT(Rf_ScalarReal(converter->src.array_view->offset + 
converter->src.offset));
+  SEXP length_sexp = PROTECT(Rf_ScalarReal(converter->src.length));
+
+  SEXP fun = PROTECT(Rf_install("convert_fallback_other"));
+  SEXP call = PROTECT(
+      Rf_lang5(fun, array_xptr, offset_sexp, length_sexp, 
converter->ptype_view.ptype));
+  SEXP result_src = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
+
+  // Currently this method can only handle the case where result_src and dst 
have the same
+  // SEXP type and length. This won't work for a data frame/record array-like 
result.
+  if (Rf_xlength(result_src) != converter->dst.length) {
+    Rf_error("Unexpected length in result of 
nanoarrow:::convert_fallback_other()");
+  }
+
+  if (TYPEOF(result_src) != TYPEOF(converter->dst.vec_sexp)) {
+    Rf_error("Unexpected SEXP type in result of 
nanoarrow:::convert_fallback_other()");
+  }
+
+  switch (TYPEOF(result_src)) {
+    case REALSXP:
+      memcpy(REAL(converter->dst.vec_sexp) + converter->dst.offset, 
REAL(result_src),
+             converter->dst.length * sizeof(double));
+      break;
+    case INTSXP:
+    case LGLSXP:
+      memcpy(INTEGER(converter->dst.vec_sexp) + converter->dst.offset,
+             INTEGER(result_src), converter->dst.length * sizeof(int));
+      break;
+    case STRSXP:
+      for (R_xlen_t i = 0; i < converter->dst.length; i++) {
+        SET_STRING_ELT(converter->dst.vec_sexp, converter->dst.offset + i,
+                       STRING_ELT(result_src, i));
+      }
+      break;
+    case VECSXP:
+      for (R_xlen_t i = 0; i < converter->dst.length; i++) {
+        SET_VECTOR_ELT(converter->dst.vec_sexp, converter->dst.offset + i,
+                       VECTOR_ELT(result_src, i));
+      }
+      break;
+    case NILSXP:
+      // Do nothing if the function returned NULL
+      break;
+    default:
+      Rf_error(
+          "Unhandled SEXP type in conversion of 
nanoarrow:::convert_fallback_other()");
+      break;
+  }
+
+  UNPROTECT(7);
+  return NANOARROW_OK;
+}
+
 static int nanoarrow_materialize_data_frame(struct RConverter* converter,
                                             SEXP converter_xptr) {
   if (converter->ptype_view.vector_type != VECTOR_TYPE_DATA_FRAME) {
     return EINVAL;
   }
 
+  // Make sure we error for dictionary types
+  if (converter->src.array_view->array->dictionary != NULL) {
+    return EINVAL;
+  }
+
   SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
   SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
 
@@ -256,6 +351,11 @@ static int nanoarrow_materialize_list_of(struct 
RConverter* converter,
   struct ArrayViewSlice* src = &converter->src;
   struct VectorSlice* dst = &converter->dst;
 
+  // Make sure we error for dictionary types
+  if (src->array_view->array->dictionary != NULL) {
+    return EINVAL;
+  }
+
   const int32_t* offsets = src->array_view->buffer_views[1].data.as_int32;
   const int64_t* large_offsets = 
src->array_view->buffer_views[1].data.as_int64;
   int64_t raw_src_offset = src->array_view->array->offset + src->offset;
@@ -309,7 +409,7 @@ static int nanoarrow_materialize_list_of(struct RConverter* 
converter,
   return NANOARROW_OK;
 }
 
-int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr) {
+static int nanoarrow_materialize_base(struct RConverter* converter, SEXP 
converter_xptr) {
   struct ArrayViewSlice* src = &converter->src;
   struct VectorSlice* dst = &converter->dst;
   struct MaterializeOptions* options = converter->options;
@@ -324,7 +424,7 @@ int nanoarrow_materialize(struct RConverter* converter, 
SEXP converter_xptr) {
     case VECTOR_TYPE_DBL:
       return nanoarrow_materialize_dbl(converter);
     case VECTOR_TYPE_CHR:
-      return nanoarrow_materialize_chr(src, dst, options);
+      return nanoarrow_materialize_chr(converter);
     case VECTOR_TYPE_POSIXCT:
       return nanoarrow_materialize_posixct(converter);
     case VECTOR_TYPE_DATE:
@@ -338,6 +438,16 @@ int nanoarrow_materialize(struct RConverter* converter, 
SEXP converter_xptr) {
     case VECTOR_TYPE_DATA_FRAME:
       return nanoarrow_materialize_data_frame(converter, converter_xptr);
     default:
-      return ENOTSUP;
+      return nanoarrow_materialize_other(converter, converter_xptr);
+  }
+}
+
+int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr) {
+  int result = nanoarrow_materialize_base(converter, converter_xptr);
+
+  if (result != NANOARROW_OK) {
+    return nanoarrow_materialize_other(converter, converter_xptr);
+  } else {
+    return NANOARROW_OK;
   }
 }
diff --git a/r/src/materialize_blob.h b/r/src/materialize_blob.h
index b9919e8..eb6faba 100644
--- a/r/src/materialize_blob.h
+++ b/r/src/materialize_blob.h
@@ -35,7 +35,7 @@ static inline int nanoarrow_materialize_blob(struct 
ArrayViewSlice* src,
     case NANOARROW_TYPE_LARGE_BINARY:
       break;
     default:
-      return EINVAL;
+      return ENOTSUP;
   }
 
   if (src->array_view->storage_type == NANOARROW_TYPE_NA) {
diff --git a/r/src/materialize_chr.h b/r/src/materialize_chr.h
index b8f1beb..2d57fdc 100644
--- a/r/src/materialize_chr.h
+++ b/r/src/materialize_chr.h
@@ -24,24 +24,23 @@
 #include "materialize_common.h"
 #include "nanoarrow.h"
 
-static inline int nanoarrow_materialize_chr(struct ArrayViewSlice* src,
-                                            struct VectorSlice* dst,
-                                            struct MaterializeOptions* 
options) {
+static inline int nanoarrow_materialize_chr(struct RConverter* converter) {
+  struct ArrayViewSlice* src = &converter->src;
+  struct VectorSlice* dst = &converter->dst;
+
   switch (src->array_view->storage_type) {
     case NANOARROW_TYPE_NA:
+      for (R_xlen_t i = 0; i < dst->length; i++) {
+        SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING);
+      }
+      return NANOARROW_OK;
+
     case NANOARROW_TYPE_STRING:
     case NANOARROW_TYPE_LARGE_STRING:
       break;
-    default:
-      return EINVAL;
-  }
 
-  if (src->array_view->storage_type == NANOARROW_TYPE_NA) {
-    for (R_xlen_t i = 0; i < dst->length; i++) {
-      SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING);
-    }
-
-    return NANOARROW_OK;
+    default:
+      return ENOTSUP;
   }
 
   struct ArrowStringView item;
diff --git a/r/src/materialize_common.h b/r/src/materialize_common.h
index e084f05..22f74c5 100644
--- a/r/src/materialize_common.h
+++ b/r/src/materialize_common.h
@@ -27,6 +27,7 @@
 // or looping at the R level. Some of these types also need an SEXP ptype to 
communicate
 // additional information.
 enum VectorType {
+  VECTOR_TYPE_UNINITIALIZED,
   VECTOR_TYPE_NULL,
   VECTOR_TYPE_UNSPECIFIED,
   VECTOR_TYPE_LGL,
diff --git a/r/src/materialize_date.h b/r/src/materialize_date.h
index 9995e0e..925c895 100644
--- a/r/src/materialize_date.h
+++ b/r/src/materialize_date.h
@@ -37,7 +37,7 @@ static int nanoarrow_materialize_date(struct RConverter* 
converter) {
     }
   }
 
-  return EINVAL;
+  return ENOTSUP;
 }
 
 #endif
diff --git a/r/src/materialize_dbl.h b/r/src/materialize_dbl.h
index 865ef4a..3bebe47 100644
--- a/r/src/materialize_dbl.h
+++ b/r/src/materialize_dbl.h
@@ -23,36 +23,12 @@
 
 #include "materialize_common.h"
 #include "nanoarrow.h"
-#include "util.h"
 
-// Fall back to arrow for decimal conversion via a package helper
-static inline void nanoarrow_materialize_decimal_to_dbl(struct RConverter* 
converter) {
-  // A unique situation where we don't want owning external pointers because 
we know
-  // these are protected for the duration of our call into R and because we 
don't want
-  // then to be garbage collected and invalidate the converter
-  SEXP array_xptr =
-      PROTECT(R_MakeExternalPtr(converter->array_view.array, R_NilValue, 
R_NilValue));
-  Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array);
-  SEXP schema_xptr =
-      PROTECT(R_MakeExternalPtr(converter->schema_view.schema, R_NilValue, 
R_NilValue));
-  Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
-
-  SEXP offset_sexp = PROTECT(Rf_ScalarReal(converter->src.offset));
-  SEXP length_sexp = PROTECT(Rf_ScalarReal(converter->src.length));
-
-  SEXP fun = PROTECT(Rf_install("convert_decimal_to_double"));
-  SEXP call = PROTECT(Rf_lang5(fun, array_xptr, schema_xptr, offset_sexp, 
length_sexp));
-  SEXP result_src = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
-  if (Rf_xlength(result_src) != converter->dst.length) {
-    Rf_error("Unexpected result in call to Arrow for decimal conversion");
+static inline int nanoarrow_materialize_dbl(struct RConverter* converter) {
+  if (converter->src.array_view->array->dictionary != NULL) {
+    return ENOTSUP;
   }
 
-  memcpy(REAL(converter->dst.vec_sexp) + converter->dst.offset, 
REAL(result_src),
-         converter->dst.length * sizeof(double));
-  UNPROTECT(7);
-}
-
-static inline int nanoarrow_materialize_dbl(struct RConverter* converter) {
   struct ArrayViewSlice* src = &converter->src;
   struct VectorSlice* dst = &converter->dst;
   double* result = REAL(dst->vec_sexp);
@@ -110,10 +86,6 @@ static inline int nanoarrow_materialize_dbl(struct 
RConverter* converter) {
       }
       break;
 
-    case NANOARROW_TYPE_DECIMAL128:
-      nanoarrow_materialize_decimal_to_dbl(converter);
-      break;
-
     default:
       return EINVAL;
   }
diff --git a/r/src/materialize_int.h b/r/src/materialize_int.h
index ffd39dd..60ead5a 100644
--- a/r/src/materialize_int.h
+++ b/r/src/materialize_int.h
@@ -27,6 +27,10 @@
 static inline int nanoarrow_materialize_int(struct ArrayViewSlice* src,
                                             struct VectorSlice* dst,
                                             struct MaterializeOptions* 
options) {
+  if (src->array_view->array->dictionary != NULL) {
+    return ENOTSUP;
+  }
+
   int* result = INTEGER(dst->vec_sexp);
   int64_t n_bad_values = 0;
 
diff --git a/r/src/materialize_lgl.h b/r/src/materialize_lgl.h
index a23802c..6dbfe12 100644
--- a/r/src/materialize_lgl.h
+++ b/r/src/materialize_lgl.h
@@ -26,6 +26,10 @@
 
 static int nanoarrow_materialize_lgl(struct ArrayViewSlice* src, struct 
VectorSlice* dst,
                                      struct MaterializeOptions* options) {
+  if (src->array_view->array->dictionary != NULL) {
+    return ENOTSUP;
+  }
+
   // True for all the types supported here
   const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8;
   const uint8_t* data_buffer = src->array_view->buffer_views[1].data.as_uint8;
diff --git a/r/src/materialize_unspecified.h b/r/src/materialize_unspecified.h
index 8f2e27a..72bcc23 100644
--- a/r/src/materialize_unspecified.h
+++ b/r/src/materialize_unspecified.h
@@ -27,6 +27,10 @@
 static inline int nanoarrow_materialize_unspecified(struct ArrayViewSlice* src,
                                                     struct VectorSlice* dst,
                                                     struct MaterializeOptions* 
options) {
+  if (src->array_view->array->dictionary != NULL) {
+    return ENOTSUP;
+  }
+
   int* result = LOGICAL(dst->vec_sexp);
 
   int64_t total_offset = src->array_view->array->offset + src->offset;
diff --git a/r/tests/testthat/test-convert-array.R 
b/r/tests/testthat/test-convert-array.R
index 7b982b7..4481758 100644
--- a/r/tests/testthat/test-convert-array.R
+++ b/r/tests/testthat/test-convert-array.R
@@ -56,7 +56,7 @@ test_that("convert_array() errors for unsupported array", {
   unsupported_array <- nanoarrow_array_init(na_interval_day_time())
   expect_error(
     convert_array(as_nanoarrow_array(unsupported_array)),
-    "Can't infer R vector type for array <interval_day_time>"
+    "Can't infer R vector type for <interval_day_time>"
   )
 })
 
@@ -254,6 +254,16 @@ test_that("convert to vector works for null -> logical()", 
{
   )
 })
 
+test_that("convert to vector works for dictionary<boolean> -> logical()", {
+  array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
+  array$dictionary <- as_nanoarrow_array(c(TRUE, FALSE, NA))
+
+  expect_identical(
+    convert_array(array, logical()),
+    c(TRUE, FALSE, NA, FALSE, TRUE)
+  )
+})
+
 test_that("convert to vector errors for bad array to logical()", {
   expect_error(
     convert_array(as_nanoarrow_array(letters), logical()),
@@ -328,6 +338,16 @@ test_that("convert to vector works for null -> logical()", 
{
   )
 })
 
+test_that("convert to vector works for dictionary<integer> -> integer()", {
+  array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
+  array$dictionary <- as_nanoarrow_array(c(123L, 0L,  NA_integer_))
+
+  expect_identical(
+    convert_array(array, integer()),
+    c(123L, 0L, NA_integer_, 0L, 123L)
+  )
+})
+
 test_that("convert to vector warns for invalid integer()", {
   array <- as_nanoarrow_array(.Machine$integer.max + 1)
   expect_warning(
@@ -409,10 +429,18 @@ test_that("convert to vector works for decimal128 -> 
double()", {
   skip_if_not_installed("arrow")
 
   array <- 
as_nanoarrow_array(arrow::Array$create(1:10)$cast(arrow::decimal128(20, 10)))
+
+  # Check via S3 dispatch
   expect_equal(
     convert_array(array, double()),
     as.double(1:10)
   )
+
+  # ...and via C -> S3 dispatch
+  expect_equal(
+    convert_array.default(array, double()),
+    as.double(1:10)
+  )
 })
 
 test_that("convert to vector works for null -> double()", {
@@ -426,6 +454,16 @@ test_that("convert to vector works for null -> double()", {
   )
 })
 
+test_that("convert to vector works for dictionary<double> -> double()", {
+  array <- as_nanoarrow_array(c(0L, 1L, 2L, 1L, 0L))
+  array$dictionary <- as_nanoarrow_array(c(123, 0,  NA_real_))
+
+  expect_identical(
+    convert_array(array, double()),
+    c(123, 0, NA_real_, 0, 123)
+  )
+})
+
 test_that("convert to vector errors for bad array to double()", {
   expect_error(
     convert_array(as_nanoarrow_array(letters), double()),
@@ -463,6 +501,77 @@ test_that("convert to vector works for null -> 
character()", {
   )
 })
 
+test_that("convert to vector works for dictionary<string> -> character()", {
+  array <- as_nanoarrow_array(factor(letters[5:1]))
+
+  # Via S3 dispatch
+  expect_identical(
+    convert_array(array, character()),
+    c("e", "d", "c", "b", "a")
+  )
+
+  # Via C -> S3 dispatch
+  expect_identical(
+    convert_array.default(array, character()),
+    c("e", "d", "c", "b", "a")
+  )
+})
+
+test_that("convert to vector works for dictionary<string> -> factor()", {
+  array <- as_nanoarrow_array(factor(letters[5:1]))
+
+  # With empty levels
+  expect_identical(
+    convert_array(array, factor()),
+    factor(letters[5:1])
+  )
+
+  # With identical levels
+  expect_identical(
+    convert_array(array, factor(levels = c("a", "b", "c", "d", "e"))),
+    factor(letters[5:1])
+  )
+
+  # With mismatched levels
+  expect_identical(
+    convert_array(array, factor(levels = c("b", "a", "c", "e", "d"))),
+    factor(letters[5:1], levels = c("b", "a", "c", "e", "d"))
+  )
+
+  expect_error(
+    convert_array(array, factor(levels = letters[-4])),
+    "some levels in data do not exist"
+  )
+})
+
+test_that("batched convert to vector works for dictionary<string> -> 
factor()", {
+  # A slightly different path: convert_array.factor() called from C multiple
+  # times with different dictionaries each time.
+  array1 <- as_nanoarrow_array(factor(letters[1:5]))
+  array2 <- as_nanoarrow_array(factor(letters[6:10]))
+  array3 <- as_nanoarrow_array(factor(letters[11:15]))
+
+  stream <- basic_array_stream(list(array1, array2, array3))
+  expect_identical(
+    convert_array_stream(stream, factor(levels = letters)),
+    factor(letters[1:15], levels = letters)
+  )
+})
+
+test_that("batched convert to vector errors for dictionary<string> -> 
factor()", {
+  # We can't currently handle a preallocate + fill style conversion where the
+  # result is partial_factor().
+  array1 <- as_nanoarrow_array(factor(letters[1:5]))
+  array2 <- as_nanoarrow_array(factor(letters[6:10]))
+  array3 <- as_nanoarrow_array(factor(letters[11:15]))
+
+  stream <- basic_array_stream(list(array1, array2, array3))
+  expect_error(
+    convert_array_stream(stream, factor()),
+    "Can't allocate ptype of class 'factor'"
+  )
+})
+
 test_that("convert to vector works for blob::blob()", {
   skip_if_not_installed("blob")
 
@@ -515,7 +624,7 @@ test_that("convert to vector works for list -> 
vctrs::list_of", {
   # With bad ptype
   expect_error(
     convert_array(array_list, vctrs::list_of(.ptype = character())),
-    "Can't convert array"
+    "Can't convert `item`"
   )
 
   # With malformed ptype
@@ -552,7 +661,7 @@ test_that("convert to vector works for large_list -> 
vctrs::list_of", {
   # With bad ptype
   expect_error(
     convert_array(array_list, vctrs::list_of(.ptype = character())),
-    "Can't convert array"
+    "Can't convert `item`"
   )
 })
 
@@ -581,7 +690,7 @@ test_that("convert to vector works for fixed_size_list -> 
vctrs::list_of", {
   # With bad ptype
   expect_error(
     convert_array(array_list, vctrs::list_of(.ptype = character())),
-    "Can't convert array"
+    "Can't convert `item`"
   )
 })
 
@@ -798,11 +907,3 @@ test_that("convert to vector warns for stripped extension 
type", {
     "x: Converting unknown extension some_ext"
   )
 })
-
-test_that("convert to vector errors for dictionary types", {
-  dict_array <- as_nanoarrow_array(factor(letters[1:5]))
-  expect_error(
-    convert_array(dict_array, character()),
-    "Conversion to dictionary-encoded array is not supported"
-  )
-})
diff --git a/r/tests/testthat/test-infer-ptype.R 
b/r/tests/testthat/test-infer-ptype.R
index 920d9d8..c6516f8 100644
--- a/r/tests/testthat/test-infer-ptype.R
+++ b/r/tests/testthat/test-infer-ptype.R
@@ -123,7 +123,7 @@ test_that("infer_nanoarrow_ptype() errors for types it 
can't infer",  {
   unsupported_array <- nanoarrow_array_init(na_decimal256(3, 4))
   expect_error(
     infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_array)),
-    "Can't infer R vector type for array <decimal256\\(3, 4\\)>"
+    "Can't infer R vector type for <decimal256\\(3, 4\\)>"
   )
 
   unsupported_struct <- nanoarrow_array_init(


Reply via email to