This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1db7f905 feat(r): Add experimental `nanoarrow_vctr` to wrap a list of
arrays (#461)
1db7f905 is described below
commit 1db7f905d5a2904d46070545f21017f08cfaf280
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed May 15 15:20:49 2024 -0300
feat(r): Add experimental `nanoarrow_vctr` to wrap a list of arrays (#461)
This PR adds the `nanoarrow_vctr`, which is an R translation of the
Python `Array` class in nanoarrow's Python bindings. This is implemented
like an R `factor()` in the sense that under the hood it is a sequence
of integers (`0:(array$length - 1)` at the beginning) with attributes
that give those integers context.
This is implemented in such a way that it is "tacked on" to the existing
conversions. The existing conversions do need a refactoring (
https://github.com/apache/arrow-nanoarrow/pull/392 ), but that is a
heavy change for this point in the release cycle.
The only change needed to the existing conversion was a slight refactor
of the "consume array stream" code that correctly gave each array in the
stream its own R object to manage its lifecycle (before each array was
"materialized" and then immediately released, because no previous
conversion code required an ArrowArray to live beyond the conversion).
The motivation for this change is converting GeoArrow extension types.
In the geoarrow package, we implement an efficient conversion from a
stream of arrays to various types of R-spatial objects (e.g., sf);
however, we really don't want to invoke the default conversion for those
types because they have awful performance (e.g., the multipolygon would
be a `list(list(list(data.frame)))`) and there's no need to invoke that
number of R object conversions between the initial state (an arrow
array) and the final state (an sfc column). The nanoarrow_vctr allows
something like:
```r
df <- convert_array(some_array_containing_a_geoarrow_col)
st_as_sfc(df$geometry) # or s2::as_s2_geography(df$geometry), or something
else
```
A side-effect of this change is that we have an escape hatch for
conversions that are lossy or contain types with no R equivalent.
A quick demo:
``` r
library(nanoarrow)
arrays <- lapply(
list(1:5, 6:10, 11:13),
as_nanoarrow_array
)
# A vctr can be created from any stream
(vctr <- as_nanoarrow_vctr(basic_array_stream(arrays)))
#> <nanoarrow_vctr int32[13]>
#> [1] 1 2 3 4 5 6 7 8 9 10 11 12 13
# Under the hood this is something like a factor() where levels are
# a list of arrays with cached offsets. This is like an Arrow ChunkedArray
str(vctr)
#> <nanoarrow_vctr int32[13]>
#> List of 3
#> $ :<nanoarrow_array int32[5]>
#> ..$ length : int 5
#> ..$ null_count: int 0
#> ..$ offset : int 0
#> ..$ buffers :List of 2
#> .. ..$ :<nanoarrow_buffer validity<bool>[0][0 b]> ``
#> .. ..$ :<nanoarrow_buffer data<int32>[5][20 b]> `1 2 3 4 5`
#> ..$ dictionary: NULL
#> ..$ children : list()
#> $ :<nanoarrow_array int32[5]>
#> ..$ length : int 5
#> ..$ null_count: int 0
#> ..$ offset : int 0
#> ..$ buffers :List of 2
#> .. ..$ :<nanoarrow_buffer validity<bool>[0][0 b]> ``
#> .. ..$ :<nanoarrow_buffer data<int32>[5][20 b]> `6 7 8 9 10`
#> ..$ dictionary: NULL
#> ..$ children : list()
#> $ :<nanoarrow_array int32[3]>
#> ..$ length : int 3
#> ..$ null_count: int 0
#> ..$ offset : int 0
#> ..$ buffers :List of 2
#> .. ..$ :<nanoarrow_buffer validity<bool>[0][0 b]> ``
#> .. ..$ :<nanoarrow_buffer data<int32>[3][12 b]> `11 12 13`
#> ..$ dictionary: NULL
#> ..$ children : list()
# vctrs can be sliced:
head(vctr)
#> <nanoarrow_vctr int32[6]>
#> [1] 1 2 3 4 5 6
# ...and can live in a data.frame
head(tibble::tibble(x = vctr))
#> # A tibble: 6 × 1
#> x
#> <nnrrw_vc>
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
#> 6 6
# They can be used as zero-copy conversion targets
array <- as_nanoarrow_array(1:5)
convert_array(array, nanoarrow_vctr())
#> <nanoarrow_vctr int32[5]>
#> [1] 1 2 3 4 5
# ...also works in a nested ptype
array <- as_nanoarrow_array(data.frame(x = 1:5))
convert_array(array, tibble::tibble(x = nanoarrow_vctr()))
#> # A tibble: 5 × 1
#> x
#> <nnrrw_vc>
#> 1 1
#> 2 2
#> 3 3
#> 4 4
#> 5 5
# For nested list output, it will give a slice of the original array for
# each list item
array <- as_nanoarrow_array(
list(1:5, 6:10, NULL, 11:13),
schema = na_list(na_int32())
)
(lst_of <- convert_array(array, vctrs::list_of(nanoarrow_vctr())))
#> <list_of<nanoarrow_vctr>[4]>
#> [[1]]
#> <nanoarrow_vctr int32[5]>
#> [1] 1 2 3 4 5
#>
#> [[2]]
#> <nanoarrow_vctr int32[5]>
#> [1] 6 7 8 9 10
#>
#> [[3]]
#> NULL
#>
#> [[4]]
#> <nanoarrow_vctr int32[3]>
#> [1] 11 12 13
for (i in seq_along(lst_of)) {
array <- attr(lst_of[[i]], "chunks")[[1]]
cat(sprintf("offset: %d, length: %d\n", array$offset, array$length))
}
#> offset: 0, length: 5
#> offset: 5, length: 5
#> offset: 10, length: 3
```
<sup>Created on 2024-05-10 with [reprex
v2.1.0](https://reprex.tidyverse.org)</sup>
---
r/NAMESPACE | 14 ++
r/R/convert-array.R | 10 +
r/R/vctr.R | 343 ++++++++++++++++++++++++++++++++++
r/man/as_nanoarrow_vctr.Rd | 56 ++++++
r/man/nanoarrow-package.Rd | 1 +
r/src/convert.c | 10 +-
r/src/convert_array_stream.c | 81 ++++----
r/src/init.c | 6 +
r/src/materialize.c | 161 ++++++++++++++--
r/src/materialize.h | 4 +
r/src/vctr.c | 121 ++++++++++++
r/tests/testthat/test-convert-array.R | 115 ++++++++++++
r/tests/testthat/test-extension.R | 37 ++++
r/tests/testthat/test-vctr.R | 237 +++++++++++++++++++++++
src/nanoarrow/buffer_inline.h | 21 +++
src/nanoarrow/utils_test.cc | 12 ++
16 files changed, 1172 insertions(+), 57 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index d868f0b2..aa6e71a0 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -6,14 +6,18 @@ S3method("$",nanoarrow_buffer)
S3method("$",nanoarrow_schema)
S3method("$<-",nanoarrow_array)
S3method("$<-",nanoarrow_schema)
+S3method("[",nanoarrow_vctr)
+S3method("[<-",nanoarrow_vctr)
S3method("[[",nanoarrow_array)
S3method("[[",nanoarrow_array_stream)
S3method("[[",nanoarrow_buffer)
S3method("[[",nanoarrow_schema)
S3method("[[<-",nanoarrow_array)
S3method("[[<-",nanoarrow_schema)
+S3method("[[<-",nanoarrow_vctr)
S3method(as.data.frame,nanoarrow_array)
S3method(as.data.frame,nanoarrow_array_stream)
+S3method(as.data.frame,nanoarrow_vctr)
S3method(as.raw,nanoarrow_buffer)
S3method(as.vector,nanoarrow_array)
S3method(as.vector,nanoarrow_array_stream)
@@ -48,15 +52,19 @@ S3method(as_nanoarrow_array_stream,data.frame)
S3method(as_nanoarrow_array_stream,default)
S3method(as_nanoarrow_array_stream,nanoarrow_array)
S3method(as_nanoarrow_array_stream,nanoarrow_array_stream)
+S3method(as_nanoarrow_array_stream,nanoarrow_vctr)
S3method(as_nanoarrow_buffer,default)
S3method(as_nanoarrow_buffer,nanoarrow_buffer)
S3method(as_nanoarrow_schema,DataType)
S3method(as_nanoarrow_schema,Field)
S3method(as_nanoarrow_schema,Schema)
S3method(as_nanoarrow_schema,nanoarrow_schema)
+S3method(as_nanoarrow_schema,nanoarrow_vctr)
+S3method(c,nanoarrow_vctr)
S3method(convert_array,default)
S3method(convert_array,double)
S3method(convert_array,factor)
+S3method(convert_array,nanoarrow_vctr)
S3method(convert_array,vctrs_partial_frame)
S3method(convert_array_extension,default)
S3method(convert_array_extension,nanoarrow_extension_spec_vctrs)
@@ -64,6 +72,7 @@ S3method(format,nanoarrow_array)
S3method(format,nanoarrow_array_stream)
S3method(format,nanoarrow_buffer)
S3method(format,nanoarrow_schema)
+S3method(format,nanoarrow_vctr)
S3method(infer_nanoarrow_ptype_extension,default)
S3method(infer_nanoarrow_ptype_extension,nanoarrow_extension_spec_vctrs)
S3method(infer_nanoarrow_schema,Array)
@@ -93,6 +102,7 @@ S3method(infer_nanoarrow_schema,list)
S3method(infer_nanoarrow_schema,logical)
S3method(infer_nanoarrow_schema,nanoarrow_array)
S3method(infer_nanoarrow_schema,nanoarrow_array_stream)
+S3method(infer_nanoarrow_schema,nanoarrow_vctr)
S3method(infer_nanoarrow_schema,raw)
S3method(infer_nanoarrow_schema,vctrs_list_of)
S3method(infer_nanoarrow_schema,vctrs_unspecified)
@@ -108,6 +118,7 @@ S3method(print,nanoarrow_array)
S3method(print,nanoarrow_array_stream)
S3method(print,nanoarrow_buffer)
S3method(print,nanoarrow_schema)
+S3method(print,nanoarrow_vctr)
S3method(read_nanoarrow,character)
S3method(read_nanoarrow,connection)
S3method(read_nanoarrow,raw)
@@ -115,12 +126,14 @@ S3method(str,nanoarrow_array)
S3method(str,nanoarrow_array_stream)
S3method(str,nanoarrow_buffer)
S3method(str,nanoarrow_schema)
+S3method(str,nanoarrow_vctr)
export(array_stream_set_finalizer)
export(as_nanoarrow_array)
export(as_nanoarrow_array_extension)
export(as_nanoarrow_array_stream)
export(as_nanoarrow_buffer)
export(as_nanoarrow_schema)
+export(as_nanoarrow_vctr)
export(basic_array_stream)
export(collect_array_stream)
export(convert_array)
@@ -191,6 +204,7 @@ export(nanoarrow_pointer_release)
export(nanoarrow_pointer_set_protected)
export(nanoarrow_schema_modify)
export(nanoarrow_schema_parse)
+export(nanoarrow_vctr)
export(nanoarrow_version)
export(read_nanoarrow)
export(register_nanoarrow_extension)
diff --git a/r/R/convert-array.R b/r/R/convert-array.R
index 6de82d17..a8dbbb10 100644
--- a/r/R/convert-array.R
+++ b/r/R/convert-array.R
@@ -139,6 +139,16 @@ convert_fallback_other <- function(array, offset, length,
to) {
convert_array(array, to, .from_c = TRUE)
}
+#' @export
+convert_array.nanoarrow_vctr <- function(array, to, ...) {
+ schema <- attr(to, "schema", exact = TRUE)
+ if (is.null(schema)) {
+ schema <- infer_nanoarrow_schema(array)
+ }
+
+ new_nanoarrow_vctr(list(array), schema, class(to))
+}
+
#' @export
convert_array.double <- function(array, to, ...) {
# Handle conversion from decimal128 via arrow
diff --git a/r/R/vctr.R b/r/R/vctr.R
new file mode 100644
index 00000000..4d1c443b
--- /dev/null
+++ b/r/R/vctr.R
@@ -0,0 +1,343 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Experimental Arrow encoded arrays as R vectors
+#'
+#' This experimental vctr class allows zero or more Arrow arrays to
+#' present as an R vector without converting them. This is useful for arrays
+#' with types that do not have a non-lossy R equivalent, and helps provide an
+#' intermediary object type where the default conversion is prohibitively
+#' expensive (e.g., a nested list of data frames). These objects will not
+#' survive many vctr transformations; however, they can be sliced without
+#' copying the underlying arrays.
+#'
+#' The nanoarrow_vctr is currently implemented similarly to `factor()`: its
+#' storage type is an `integer()` that is a sequence along the total length
+#' of the vctr and there are attributes that are required to resolve these
+#' indices to an array + offset. Sequences typically have a very compact
+#' representation in recent versions of R such that this has a cheap storage
+#' footprint even for large arrays. The attributes are currently:
+#'
+#' - `schema`: The [nanoarrow_schema][as_nanoarrow_schema] shared by each
chunk.
+#' - `chunks`: A `list()` of `nanoarrow_array`.
+#' - `offsets`: An `integer()` vector beginning with `0` and followed by the
+#' cumulative length of each chunk. This allows the chunk index + offset
+#' to be resolved from a logical index with `log(n)` complexity.
+#'
+#' This implementation is preliminary and may change; however, the result of
+#' `as_nanoarrow_array_stream(some_vctr[begin:end])` should remain stable.
+#'
+#' @param x An object that works with [as_nanoarrow_array_stream()].
+#' @param subclass An optional subclass of nanoarrow_vctr to prepend to the
+#' final class name.
+#' @param ... Passed to [as_nanoarrow_array_stream()]
+#' @param schema An optional `schema`
+#'
+#' @return A vctr of class 'nanoarrow_vctr'
+#' @export
+#'
+#' @examples
+#' array <- as_nanoarrow_array(1:5)
+#' as_nanoarrow_vctr(array)
+#'
+as_nanoarrow_vctr <- function(x, ..., schema = NULL, subclass = character()) {
+ if (inherits(x, "nanoarrow_vctr") && is.null(schema)) {
+ return(x)
+ }
+
+ stream <- as_nanoarrow_array_stream(x, ..., schema = schema)
+ chunks <- collect_array_stream(stream, validate = FALSE)
+ new_nanoarrow_vctr(chunks, stream$get_schema(), subclass)
+}
+
+#' @rdname as_nanoarrow_vctr
+#' @export
+nanoarrow_vctr <- function(schema = NULL, subclass = character()) {
+ if (is.null(schema)) {
+ new_nanoarrow_vctr(list(), NULL, subclass)
+ } else {
+ new_nanoarrow_vctr(list(), as_nanoarrow_schema(schema), subclass)
+ }
+}
+
+new_nanoarrow_vctr <- function(chunks, schema, subclass = character()) {
+ offsets <- .Call(nanoarrow_c_vctr_chunk_offsets, chunks)
+ indices <- seq_len(offsets[length(offsets)])
+
+ structure(
+ indices,
+ schema = schema,
+ chunks = chunks,
+ offsets = offsets,
+ class = union(subclass, "nanoarrow_vctr")
+ )
+}
+
+#' @export
+`[.nanoarrow_vctr` <- function(x, i) {
+ attrs <- attributes(x)
+ x <- NextMethod()
+
+ if (is.null(vctr_as_slice(x))) {
+ stop(
+ "Can't subset nanoarrow_vctr with non-slice (e.g., only i:j indexing is
supported)"
+ )
+ }
+
+ attributes(x) <- attrs
+ x
+}
+
+#' @export
+`[<-.nanoarrow_vctr` <- function(x, i, value) {
+ stop("subset assignment for nanoarrow_vctr is not supported")
+}
+
+#' @export
+`[[<-.nanoarrow_vctr` <- function(x, i, value) {
+ stop("subset assignment for nanoarrow_vctr is not supported")
+}
+
+#' @export
+format.nanoarrow_vctr <- function(x, ...) {
+ if (length(x) == 0) {
+ return(character())
+ }
+
+ stream <- as_nanoarrow_array_stream(x)
+ converted <- convert_array_stream(stream)
+
+ # This needs to be a character() with the same length as x to work with
+ # RStudio's viewer. Data frames need special handling in this case.
+ size_stable_format(converted)
+}
+
+size_stable_format <- function(x, ...) {
+ if (inherits(x, "nanoarrow_vctr")) {
+ # Extension types could have a default convert that gives a nanoarrow_vctr.
+ # If this is the case, they should be returning a subclass with a format
+ # method that ensures we don't get here.
+ rep(sprintf("<%s[%d]>", class(x)[1], seq_along(x)))
+ } else if (inherits(x, "data.frame")) {
+ cols <- lapply(x, size_stable_format, ...)
+ cols <- Map(paste, names(x), cols, sep = ": ")
+ rows <- do.call(paste, c(cols, list(sep = ", ")))
+ paste0("{", rows, "}")
+ } else {
+ format(x, ...)
+ }
+}
+
+#' @export
+infer_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) {
+ attr(x, "schema", exact = TRUE)
+}
+
+# Because zero-length vctrs are R's way of communicating "type", implement
+# as_nanoarrow_schema() here so that it works in places that expect a type
+#' @export
+as_nanoarrow_schema.nanoarrow_vctr <- function(x, ...) {
+ attr(x, "schema", exact = TRUE)
+}
+
+
+#' @export
+as_nanoarrow_array_stream.nanoarrow_vctr <- function(x, ..., schema = NULL) {
+ if (!is.null(schema)) {
+ # If a schema is passed, first resolve the stream as is and then use
+ # as_nanoarrow_array_stream() to either cast (when this is supported)
+ # or error.
+ stream <- as_nanoarrow_array_stream(x, schema = NULL)
+ return(as_nanoarrow_array_stream(stream, schema = schema))
+ }
+
+ # Resolve the indices as c(1-based start, length)
+ slice <- vctr_as_slice(x)
+ if (is.null(slice)) {
+ stop("Can't resolve non-slice nanoarrow_vctr to nanoarrow_array_stream")
+ }
+
+ x_schema <- attr(x, "schema", exact = TRUE)
+
+ # Zero-size slice can be an array stream with zero batches
+ if (slice[2] == 0) {
+ return(basic_array_stream(list(), schema = x_schema))
+ }
+
+ # Full slice doesn't need slicing logic
+ offsets <- attr(x, "offsets", exact = TRUE)
+ batches <- attr(x, "chunks", exact = TRUE)
+ if (slice[1] == 1 && slice[2] == max(offsets)) {
+ return(
+ basic_array_stream(
+ batches,
+ schema = x_schema,
+ validate = FALSE
+ )
+ )
+ }
+
+ # Calculate first and last slice information
+ first_index <- slice[1] - 1L
+ end_index <- first_index + slice[2]
+ last_index <- end_index - 1L
+ first_chunk_index <- vctr_resolve_chunk(first_index, offsets)
+ last_chunk_index <- vctr_resolve_chunk(last_index, offsets)
+
+ first_chunk_offset <- first_index - offsets[first_chunk_index + 1L]
+ first_chunk_length <- offsets[first_chunk_index + 2L] - first_index
+ last_chunk_offset <- 0L
+ last_chunk_length <- end_index - offsets[last_chunk_index + 1L]
+
+ # Calculate first and last slices
+ if (first_chunk_index == last_chunk_index) {
+ batch <- vctr_array_slice(
+ batches[[first_chunk_index + 1L]],
+ first_chunk_offset,
+ last_chunk_length - first_chunk_offset
+ )
+
+ return(
+ basic_array_stream(
+ list(batch),
+ schema = x_schema,
+ validate = FALSE
+ )
+ )
+ }
+
+ batch1 <- vctr_array_slice(
+ batches[[first_chunk_index + 1L]],
+ first_chunk_offset,
+ first_chunk_length
+ )
+
+ batchn <- vctr_array_slice(
+ batches[[last_chunk_index + 1L]],
+ last_chunk_offset,
+ last_chunk_length
+ )
+
+ seq_mid <- seq_len(last_chunk_index - first_chunk_index - 1)
+ batch_mid <- batches[first_chunk_index + seq_mid]
+
+ basic_array_stream(
+ c(
+ list(batch1),
+ batch_mid,
+ list(batchn)
+ ),
+ schema = x_schema,
+ validate = FALSE
+ )
+}
+
+#' @export
+c.nanoarrow_vctr <- function(...) {
+ dots <- list(...)
+
+ # This one we can do safely
+ if (length(dots) == 1) {
+ return(dots[[1]])
+ }
+
+ stop("c() not implemented for nanoarrow_vctr()")
+}
+
+# Ensures that nanoarrow_vctr can fit in a data.frame
+#' @export
+as.data.frame.nanoarrow_vctr <- function(x, ..., optional = FALSE) {
+ if (!optional) {
+ stop(sprintf("cannot coerce object of type '%s' to data.frame",
class(x)[1]))
+ } else {
+ new_data_frame(list(x), nrow = length(x))
+ }
+}
+
+#' @export
+print.nanoarrow_vctr <- function(x, ...) {
+ schema <- attr(x, "schema", exact = TRUE)
+ if (is.null(schema)) {
+ cat(sprintf("<%s <any>>\n", class(x)[1]))
+ return(invisible(x))
+ }
+
+ formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE)
+ cat(sprintf("<%s %s[%d]>\n", class(x)[1], formatted, length(x)))
+
+ n_values <- min(length(x), 20)
+ more_values <- length(x) - n_values
+ stream <- as_nanoarrow_array_stream(utils::head(x, n_values))
+ converted_head <- convert_array_stream(stream)
+
+ if (inherits(converted_head, "nanoarrow_vctr")) {
+ converted_head <- format(converted_head)
+ print(converted_head, quote = FALSE)
+ } else {
+ print(converted_head)
+ }
+
+ if (more_values >= 2) {
+ cat(sprintf("...and %d more values\n", more_values))
+ } else if (more_values >= 1) {
+ cat(sprintf("...and %d more value\n", more_values))
+ }
+
+ invisible(x)
+}
+
+#' @export
+str.nanoarrow_vctr <- function(object, ...) {
+ schema <- attr(object, "schema", exact = TRUE)
+ if (is.null(schema)) {
+ cat(sprintf("<%s <any>>\n", class(object)[1]))
+ return(invisible(object))
+ }
+
+ formatted <- nanoarrow_schema_formatted(schema, recursive = FALSE)
+ cat(sprintf("<%s %s[%d]>\n", class(object)[1], formatted, length(object)))
+
+ # Prints out the C data interface dump of each chunk with the chunk
+ # index above.
+ str(attr(object, "chunks"))
+
+ invisible(object)
+}
+
+# Utilities for vctr methods
+
+vctr_resolve_chunk <- function(x, offsets) {
+ .Call(nanoarrow_c_vctr_chunk_resolve, x, offsets)
+}
+
+vctr_as_slice <- function(x) {
+ .Call(nanoarrow_c_vctr_as_slice, x)
+}
+
+vctr_array_slice <- function(x, offset, length) {
+ new_offset <- x$offset + offset
+ new_length <- length
+ nanoarrow_array_modify(
+ x,
+ list(offset = new_offset, length = new_length),
+ validate = FALSE
+ )
+}
diff --git a/r/man/as_nanoarrow_vctr.Rd b/r/man/as_nanoarrow_vctr.Rd
new file mode 100644
index 00000000..1b6d1493
--- /dev/null
+++ b/r/man/as_nanoarrow_vctr.Rd
@@ -0,0 +1,56 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/vctr.R
+\name{as_nanoarrow_vctr}
+\alias{as_nanoarrow_vctr}
+\alias{nanoarrow_vctr}
+\title{Experimental Arrow encoded arrays as R vectors}
+\usage{
+as_nanoarrow_vctr(x, ..., schema = NULL, subclass = character())
+
+nanoarrow_vctr(schema = NULL, subclass = character())
+}
+\arguments{
+\item{x}{An object that works with
\code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}.}
+
+\item{...}{Passed to
\code{\link[=as_nanoarrow_array_stream]{as_nanoarrow_array_stream()}}}
+
+\item{schema}{An optional \code{schema}}
+
+\item{subclass}{An optional subclass of nanoarrow_vctr to prepend to the
+final class name.}
+}
+\value{
+A vctr of class 'nanoarrow_vctr'
+}
+\description{
+This experimental vctr class allows zero or more Arrow arrays to
+present as an R vector without converting them. This is useful for arrays
+with types that do not have a non-lossy R equivalent, and helps provide an
+intermediary object type where the default conversion is prohibitively
+expensive (e.g., a nested list of data frames). These objects will not
+survive many vctr transformations; however, they can be sliced without
+copying the underlying arrays.
+}
+\details{
+The nanoarrow_vctr is currently implemented similarly to \code{factor()}: its
+storage type is an \code{integer()} that is a sequence along the total length
+of the vctr and there are attributes that are required to resolve these
+indices to an array + offset. Sequences typically have a very compact
+representation in recent versions of R such that this has a cheap storage
+footprint even for large arrays. The attributes are currently:
+\itemize{
+\item \code{schema}: The \link[=as_nanoarrow_schema]{nanoarrow_schema} shared
by each chunk.
+\item \code{chunks}: A \code{list()} of \code{nanoarrow_array}.
+\item \code{offsets}: An \code{integer()} vector beginning with \code{0} and
followed by the
+cumulative length of each chunk. This allows the chunk index + offset
+to be resolved from a logical index with \code{log(n)} complexity.
+}
+
+This implementation is preliminary and may change; however, the result of
+\code{as_nanoarrow_array_stream(some_vctr[begin:end])} should remain stable.
+}
+\examples{
+array <- as_nanoarrow_array(1:5)
+as_nanoarrow_vctr(array)
+
+}
diff --git a/r/man/nanoarrow-package.Rd b/r/man/nanoarrow-package.Rd
index 5a2fc6ec..106c080a 100644
--- a/r/man/nanoarrow-package.Rd
+++ b/r/man/nanoarrow-package.Rd
@@ -11,6 +11,7 @@ Provides an 'R' interface to the 'nanoarrow' 'C' library and
the 'Apache Arrow'
\seealso{
Useful links:
\itemize{
+ \item \url{https://arrow.apache.org/nanoarrow/latest/r/}
\item \url{https://github.com/apache/arrow-nanoarrow}
\item Report bugs at \url{https://github.com/apache/arrow-nanoarrow/issues}
}
diff --git a/r/src/convert.c b/r/src/convert.c
index 2148421a..e232d67d 100644
--- a/r/src/convert.c
+++ b/r/src/convert.c
@@ -414,11 +414,7 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) {
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
SEXP current_result = VECTOR_ELT(converter_shelter, 4);
- // Materialize never called (e.g., empty stream)
- if (current_result == R_NilValue) {
- NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0));
- current_result = VECTOR_ELT(converter_shelter, 4);
- }
+
NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(converter_xptr));
// Check result size. A future implementation could also shrink the length
// or reallocate a shorter vector.
@@ -437,15 +433,19 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) {
SEXP nanoarrow_converter_release_result(SEXP converter_xptr) {
struct RConverter* converter = (struct
RConverter*)R_ExternalPtrAddr(converter_xptr);
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+
// PROTECT()ing here because we are about to release the object from the
// shelter of the converter and return it
SEXP result = PROTECT(VECTOR_ELT(converter_shelter, 4));
SET_VECTOR_ELT(converter_shelter, 4, R_NilValue);
+
+ // Reset the converter state
converter->dst.vec_sexp = R_NilValue;
converter->dst.offset = 0;
converter->dst.length = 0;
converter->size = 0;
converter->capacity = 0;
+
UNPROTECT(1);
return result;
}
diff --git a/r/src/convert_array_stream.c b/r/src/convert_array_stream.c
index b5315f38..4c5a16f5 100644
--- a/r/src/convert_array_stream.c
+++ b/r/src/convert_array_stream.c
@@ -26,6 +26,46 @@
#include "convert.h"
#include "schema.h"
+static int convert_next(SEXP converter_xptr, struct ArrowArrayStream* stream,
+ SEXP schema_xptr, int64_t* n_batches) {
+ SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr());
+ struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr);
+
+ // Fetch the next array
+ int result = ArrowArrayStreamGetNext(stream, array, NULL);
+ if (result != NANOARROW_OK) {
+ Rf_error("ArrowArrayStream::get_next(): %s",
ArrowArrayStreamGetLastError(stream));
+ }
+
+ // Check if the stream is finished
+ if (array->release == NULL) {
+ UNPROTECT(1);
+ return 0;
+ }
+
+ // Bump the batch counter
+ (*n_batches)++;
+
+ // Set the schema of the allocated array and pass it to the converter
+ R_SetExternalPtrTag(array_xptr, schema_xptr);
+ if (nanoarrow_converter_set_array(converter_xptr, array_xptr) !=
NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ // After set_array, the converter is responsible for the array_xptr
+ UNPROTECT(1);
+
+ // Materialize the array into the converter
+ int64_t n_materialized =
+ nanoarrow_converter_materialize_n(converter_xptr, array->length);
+ if (n_materialized != array->length) {
+ Rf_error("Expected to materialize %ld values in batch %ld but materialized
%ld",
+ (long)array->length, (long)(*n_batches), (long)n_materialized);
+ }
+
+ return 1;
+}
+
SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp,
SEXP size_sexp, SEXP n_sexp) {
struct ArrowArrayStream* array_stream =
@@ -58,49 +98,18 @@ SEXP nanoarrow_c_convert_array_stream(SEXP
array_stream_xptr, SEXP ptype_sexp,
nanoarrow_converter_stop(converter_xptr);
}
- SEXP array_xptr = PROTECT(nanoarrow_array_owning_xptr());
- struct ArrowArray* array = nanoarrow_output_array_from_xptr(array_xptr);
-
int64_t n_batches = 0;
- int64_t n_materialized = 0;
- if (n > 0) {
- result = ArrowArrayStreamGetNext(array_stream, array, NULL);
- n_batches++;
- if (result != NANOARROW_OK) {
- Rf_error("ArrowArrayStream::get_next(): %s",
- ArrowArrayStreamGetLastError(array_stream));
+ do {
+ if (n_batches >= n) {
+ break;
}
-
- while (array->release != NULL) {
- if (nanoarrow_converter_set_array(converter_xptr, array_xptr) !=
NANOARROW_OK) {
- nanoarrow_converter_stop(converter_xptr);
- }
-
- n_materialized = nanoarrow_converter_materialize_n(converter_xptr,
array->length);
- if (n_materialized != array->length) {
- Rf_error("Expected to materialize %ld values in batch %ld but
materialized %ld",
- (long)array->length, (long)n_batches, (long)n_materialized);
- }
-
- if (n_batches >= n) {
- break;
- }
-
- array->release(array);
- result = ArrowArrayStreamGetNext(array_stream, array, NULL);
- n_batches++;
- if (result != NANOARROW_OK) {
- Rf_error("ArrowArrayStream::get_next(): %s",
- ArrowArrayStreamGetLastError(array_stream));
- }
- }
- }
+ } while (convert_next(converter_xptr, array_stream, schema_xptr,
&n_batches));
if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) {
nanoarrow_converter_stop(converter_xptr);
}
SEXP result_sexp =
PROTECT(nanoarrow_converter_release_result(converter_xptr));
- UNPROTECT(4);
+ UNPROTECT(3);
return result_sexp;
}
diff --git a/r/src/init.c b/r/src/init.c
index 913ea77a..69c94391 100644
--- a/r/src/init.c
+++ b/r/src/init.c
@@ -91,6 +91,9 @@ extern SEXP nanoarrow_c_schema_set_dictionary(SEXP
schema_mut_xptr, SEXP diction
extern SEXP nanoarrow_c_preserved_count(void);
extern SEXP nanoarrow_c_preserved_empty(void);
extern SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj);
+extern SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list);
+extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP
offsets_sexp);
+extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp);
extern SEXP nanoarrow_c_version(void);
extern SEXP nanoarrow_c_version_runtime(void);
@@ -165,6 +168,9 @@ static const R_CallMethodDef CallEntries[] = {
{"nanoarrow_c_preserved_empty", (DL_FUNC)&nanoarrow_c_preserved_empty, 0},
{"nanoarrow_c_preserve_and_release_on_other_thread",
(DL_FUNC)&nanoarrow_c_preserve_and_release_on_other_thread, 1},
+ {"nanoarrow_c_vctr_chunk_offsets",
(DL_FUNC)&nanoarrow_c_vctr_chunk_offsets, 1},
+ {"nanoarrow_c_vctr_chunk_resolve",
(DL_FUNC)&nanoarrow_c_vctr_chunk_resolve, 2},
+ {"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1},
{"nanoarrow_c_version", (DL_FUNC)&nanoarrow_c_version, 0},
{"nanoarrow_c_version_runtime", (DL_FUNC)&nanoarrow_c_version_runtime, 0},
{NULL, NULL, 0}};
diff --git a/r/src/materialize.c b/r/src/materialize.c
index c2bca993..45d6950d 100644
--- a/r/src/materialize.c
+++ b/r/src/materialize.c
@@ -19,8 +19,9 @@
#include <R.h>
#include <Rinternals.h>
+#include "array.h"
#include "nanoarrow.h"
-
+#include "nanoarrow/r.h"
#include "util.h"
// Needed for the list_of materializer
@@ -108,6 +109,10 @@ int nanoarrow_ptype_is_data_frame(SEXP ptype) {
(Rf_xlength(ptype) > 0 && has_attrib_safe(ptype, R_NamesSymbol)));
}
+int nanoarrow_ptype_is_nanoarrow_vctr(SEXP ptype) {
+ return Rf_inherits(ptype, "nanoarrow_vctr");
+}
+
SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) {
SEXP result;
@@ -122,7 +127,27 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t
len) {
}
}
- if (nanoarrow_ptype_is_data_frame(ptype)) {
+ if (nanoarrow_ptype_is_nanoarrow_vctr(ptype)) {
+ // The object we return here is one that will accumulate chunks and
+ // be finalized with a value (rather than being strictly copied into
+ // after every new chunk is seen).
+ result = PROTECT(Rf_allocVector(INTSXP, len));
+ Rf_copyMostAttrib(ptype, result);
+
+ // For the purposes of building the list of chunks, chunks is a pairlist
+ // (it will be converted to a regular list when this converter is
finalized)
+ // Technically the first value here won't be used (this simplifies the
+ // appending).
+ SEXP chunks_list = PROTECT(Rf_list1(R_NilValue));
+
+ // To start, the chunks list and the end of the chunks list are the same
node
+ SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail"));
+ SEXP chunks_sym = PROTECT(Rf_install("chunks"));
+ Rf_setAttrib(result, chunks_sym, chunks_list);
+ Rf_setAttrib(result, chunks_tail_sym, chunks_list);
+
+ UNPROTECT(3);
+ } else if (nanoarrow_ptype_is_data_frame(ptype)) {
R_xlen_t num_cols = Rf_xlength(ptype);
result = PROTECT(Rf_allocVector(VECSXP, num_cols));
for (R_xlen_t i = 0; i < num_cols; i++) {
@@ -207,6 +232,10 @@ static void fill_vec_with_nulls(SEXP x, R_xlen_t offset,
R_xlen_t len) {
}
static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) {
+ if (nanoarrow_ptype_is_nanoarrow_vctr(dst)) {
+ Rf_error("Can't copy_vec_into() to nanoarrow_vctr");
+ }
+
if (nanoarrow_ptype_is_data_frame(dst)) {
if (!nanoarrow_ptype_is_data_frame(x)) {
Rf_error("Expected record-style vctr result but got non-record-style
result");
@@ -270,6 +299,107 @@ static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t
offset, R_xlen_t len) {
}
}
+int nanoarrow_materialize_finalize_result(SEXP converter_xptr) {
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SEXP result = VECTOR_ELT(converter_shelter, 4);
+
+ // Materialize never called (e.g., empty stream)
+ if (result == R_NilValue) {
+ NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0));
+ result = VECTOR_ELT(converter_shelter, 4);
+ }
+
+ if (nanoarrow_ptype_is_nanoarrow_vctr(result)) {
+ // Get the schema for this converter. Technically this will overwrite
+ // a schema that was provided explicitly; however, we currently do not
+ // handle that case.
+
+ SEXP schema_xptr = VECTOR_ELT(converter_shelter, 1);
+
+ // We no longer need to keep track of chunks_tail
+ SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail"));
+ Rf_setAttrib(result, chunks_tail_sym, R_NilValue);
+
+ // We also want to pass on the class of the ptype we received
+ SEXP subclass_sexp = Rf_getAttrib(result, R_ClassSymbol);
+
+ // We no longer need the first element of the pairlist, which was
+ // intentionally set to R_NilValue.
+ SEXP chunks_sym = PROTECT(Rf_install("chunks"));
+ SEXP chunks_pairlist0 = Rf_getAttrib(result, chunks_sym);
+
+ // If there were zero chunks, there will be no "first" node
+ SEXP chunks_list;
+ if (CDR(chunks_pairlist0) == R_NilValue) {
+ chunks_list = PROTECT(Rf_allocVector(VECSXP, 0));
+ } else {
+ chunks_list = PROTECT(Rf_PairToVectorList(CDR(chunks_pairlist0)));
+ }
+
+ // Set up the call to new_nanoarrow_vctr
+ SEXP new_nanoarrow_vctr_sym = PROTECT(Rf_install("new_nanoarrow_vctr"));
+ SEXP new_nanoarrow_vctr_call = PROTECT(
+ Rf_lang4(new_nanoarrow_vctr_sym, chunks_list, schema_xptr,
subclass_sexp));
+ SEXP final_result = PROTECT(Rf_eval(new_nanoarrow_vctr_call,
nanoarrow_ns_pkg));
+
+ SET_VECTOR_ELT(converter_shelter, 4, final_result);
+ UNPROTECT(6);
+ } else if (nanoarrow_ptype_is_data_frame(result)) {
+ // For each child, finalize the result and then reassign it
+ SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
+ for (R_xlen_t i = 0; i < Rf_xlength(child_converter_xptrs); i++) {
+ SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i);
+ NANOARROW_RETURN_NOT_OK(
+ nanoarrow_materialize_finalize_result(child_converter_xptr));
+
+ SEXP child_result =
+ PROTECT(nanoarrow_converter_release_result(child_converter_xptr));
+ SET_VECTOR_ELT(result, i, child_result);
+ UNPROTECT(1);
+ }
+ }
+
+ return NANOARROW_OK;
+}
+
+static int nanoarrow_materialize_nanoarrow_vctr(struct RConverter* converter,
+ SEXP converter_xptr) {
+ // This is a case where the callee needs ownership, which we can do via a
+ // shallow copy.
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SEXP schema_xptr = VECTOR_ELT(converter_shelter, 1);
+ SEXP array_xptr = VECTOR_ELT(converter_shelter, 2);
+
+ SEXP array_out_xptr = PROTECT(nanoarrow_array_owning_xptr());
+ struct ArrowArray* out_array =
nanoarrow_output_array_from_xptr(array_out_xptr);
+ array_export(array_xptr, out_array);
+ R_SetExternalPtrTag(array_out_xptr, schema_xptr);
+
+ // Update the offset/length in case a slice is being requested from the
+ // converter.
+ out_array->offset += converter->src.offset;
+ out_array->length = converter->src.length;
+
+ // Get the cached copy of the pairlist node at the end of the current
+ // chunks list.
+ SEXP chunks_tail_sym = PROTECT(Rf_install("chunks_tail"));
+ SEXP chunks_tail = PROTECT(Rf_getAttrib(converter->dst.vec_sexp,
chunks_tail_sym));
+
+ // Create a length-1 pairlist node containing the chunk
+ SEXP next_sexp = PROTECT(Rf_cons(array_out_xptr, R_NilValue));
+
+ // Append it to the end of the current pairlist
+ SETCDR(chunks_tail, next_sexp);
+ UNPROTECT(1);
+
+ // Update the cached copy of the pairlist node at the end of the current
+ // chunks list.
+ Rf_setAttrib(converter->dst.vec_sexp, chunks_tail_sym, next_sexp);
+ UNPROTECT(3);
+
+ return NANOARROW_OK;
+}
+
static int nanoarrow_materialize_other(struct RConverter* converter,
SEXP converter_xptr) {
// Ensure that we have a ptype SEXP to send in the call back to R
@@ -280,20 +410,19 @@ static int nanoarrow_materialize_other(struct RConverter*
converter,
UNPROTECT(1);
}
- // A unique situation where we don't want owning external pointers because
we know
- // these are protected for the duration of our call into R and because we
don't want
- // the underlying array to be released and invalidate the converter. The R
code in
- // convert_fallback_other() takes care of ensuring an independent copy with
the correct
- // offset/length.
- SEXP schema_xptr = PROTECT(R_MakeExternalPtr(
- (struct ArrowSchema*)converter->schema_view.schema, R_NilValue,
R_NilValue));
- Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
- // We do need to set the protected member of the array external pointer to
signal that
- // it is not an independent array (i.e., force a shallow copy).
- SEXP array_xptr = PROTECT(R_MakeExternalPtr(
- (struct ArrowArray*)converter->array_view.array, schema_xptr,
converter_xptr));
- Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array);
+ // Special-case the nanoarrow_vctr conversion
+ if (Rf_inherits(converter->dst.vec_sexp, "nanoarrow_vctr")) {
+ return nanoarrow_materialize_nanoarrow_vctr(converter, converter_xptr);
+ }
+
+ // We've ensured proper ownership of array_xptr and ensured that its
+ // schema is set, so we can pass these safely to the R-level
+ // convert_fallback_other.
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SEXP array_xptr = VECTOR_ELT(converter_shelter, 2);
+ // The R code in convert_fallback_other() takes care of ensuring an
independent copy
+ // with the correct offset/length if it is necessary to update them.
SEXP offset_sexp = PROTECT(
Rf_ScalarReal((double)(converter->src.array_view->offset +
converter->src.offset)));
SEXP length_sexp = PROTECT(Rf_ScalarReal((double)converter->src.length));
@@ -307,7 +436,7 @@ static int nanoarrow_materialize_other(struct RConverter*
converter,
copy_vec_into(result_src, converter->dst.vec_sexp, converter->dst.offset,
converter->dst.length);
- UNPROTECT(7);
+ UNPROTECT(5);
return NANOARROW_OK;
}
diff --git a/r/src/materialize.h b/r/src/materialize.h
index a8f36cc9..c3b2c5cb 100644
--- a/r/src/materialize.h
+++ b/r/src/materialize.h
@@ -42,4 +42,8 @@ int nanoarrow_materialize(struct RConverter* converter, SEXP
converter_xptr);
SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len);
SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len);
+// Finalize an object before returning to R. Currently only used for
+// nanoarrow_vctr conversion.
+int nanoarrow_materialize_finalize_result(SEXP converter_xptr);
+
#endif
diff --git a/r/src/vctr.c b/r/src/vctr.c
new file mode 100644
index 00000000..e03fdfee
--- /dev/null
+++ b/r/src/vctr.c
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+#include "nanoarrow/r.h"
+
+SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list) {
+ int num_chunks = Rf_length(array_list);
+ SEXP offsets_sexp = PROTECT(Rf_allocVector(INTSXP, num_chunks + 1));
+ int* offsets = INTEGER(offsets_sexp);
+ offsets[0] = 0;
+ int64_t cumulative_offset = 0;
+
+ struct ArrowArray* array;
+ for (int i = 0; i < num_chunks; i++) {
+ array = nanoarrow_array_from_xptr(VECTOR_ELT(array_list, i));
+ cumulative_offset += array->length;
+ if (cumulative_offset > INT_MAX) {
+ Rf_error("Can't build nanoarrow_vctr with length > INT_MAX"); // # nocov
+ }
+
+ offsets[i + 1] = cumulative_offset;
+ }
+
+ UNPROTECT(1);
+ return offsets_sexp;
+}
+
+SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp) {
+ int* offsets = INTEGER(offsets_sexp);
+ int n_offsets = Rf_length(offsets_sexp);
+ int last_offset = offsets[n_offsets - 1];
+
+ int n = Rf_length(indices_sexp);
+ SEXP chunk_indices_sexp = PROTECT(Rf_allocVector(INTSXP, n));
+ int* chunk_indices = INTEGER(chunk_indices_sexp);
+
+ int buf[1024];
+ for (int i = 0; i < n; i++) {
+ if (i % 1024 == 0) {
+ INTEGER_GET_REGION(indices_sexp, i, 1024, buf);
+ }
+ int index0 = buf[i % 1024];
+
+ if (index0 < 0 || index0 > last_offset) {
+ chunk_indices[i] = NA_INTEGER;
+ } else {
+ chunk_indices[i] = ArrowResolveChunk32(index0, offsets, 0, n_offsets);
+ }
+ }
+
+ UNPROTECT(1);
+ return chunk_indices_sexp;
+}
+
+SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp) {
+ if (TYPEOF(indices_sexp) != INTSXP) {
+ return R_NilValue;
+ }
+ SEXP slice_sexp = PROTECT(Rf_allocVector(INTSXP, 2));
+ int* slice = INTEGER(slice_sexp);
+
+ int n = Rf_length(indices_sexp);
+ slice[1] = n;
+
+ if (n == 1) {
+ slice[0] = INTEGER_ELT(indices_sexp, 0);
+ UNPROTECT(1);
+ return slice_sexp;
+ } else if (n == 0) {
+ slice[0] = NA_INTEGER;
+ UNPROTECT(1);
+ return slice_sexp;
+ }
+
+ // It may be possible to check for the R ALTREP sequence type,
+ // which would eliminate the need for the below check for
+ // sequential values.
+
+ int buf[1024];
+ INTEGER_GET_REGION(indices_sexp, 0, 1024, buf);
+ slice[0] = buf[0];
+
+ int last_value = buf[0];
+ int this_value = 0;
+
+ for (int i = 1; i < n; i++) {
+ if (i % 1024 == 0) {
+ INTEGER_GET_REGION(indices_sexp, i, 1024, buf);
+ }
+
+ this_value = buf[i % 1024];
+ if ((this_value - last_value) != 1) {
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ last_value = this_value;
+ }
+
+ UNPROTECT(1);
+ return slice_sexp;
+}
diff --git a/r/tests/testthat/test-convert-array.R
b/r/tests/testthat/test-convert-array.R
index 5731ce10..17d192fd 100644
--- a/r/tests/testthat/test-convert-array.R
+++ b/r/tests/testthat/test-convert-array.R
@@ -160,6 +160,121 @@ test_that("convert to vector works for tibble", {
)
})
+# Pins the contract that convert_array()/convert_array_stream() honor a
+# nanoarrow_vctr ptype: the result keeps the "nanoarrow_vctr" class, the
+# element count, and an inferable schema matching the source array.
+test_that("convert to vector works for nanoarrow_vctr()", {
+ array <- as_nanoarrow_array(c("one", "two", "three"))
+
+ # Check implicit/inferred nanoarrow_vctr() schema
+ vctr <- convert_array(array, nanoarrow_vctr())
+ expect_s3_class(vctr, "nanoarrow_vctr")
+ expect_length(vctr, 3)
+ schema <- infer_nanoarrow_schema(vctr)
+ expect_identical(schema$format, "u")
+
+ # Check with explicit schema of the correct type
+ vctr <- convert_array(array, nanoarrow_vctr(na_string()))
+ expect_s3_class(vctr, "nanoarrow_vctr")
+ expect_length(vctr, 3)
+ schema <- infer_nanoarrow_schema(vctr)
+ expect_identical(schema$format, "u")
+
+ # Check conversion of a struct array
+ df <- data.frame(x = c("one", "two", "three"))
+ array <- as_nanoarrow_array(df)
+
+ vctr <- convert_array(array, nanoarrow_vctr())
+ expect_s3_class(vctr, "nanoarrow_vctr")
+ expect_length(vctr, 3)
+ schema <- infer_nanoarrow_schema(vctr)
+ # "+s" is the Arrow format string for a struct
+ expect_identical(schema$format, "+s")
+
+ vctr <- convert_array(array, nanoarrow_vctr(na_struct(list(x =
na_string()))))
+ expect_s3_class(vctr, "nanoarrow_vctr")
+ expect_length(vctr, 3)
+ schema <- infer_nanoarrow_schema(vctr)
+ expect_identical(schema$format, "+s")
+})
+
+# Streams with zero, one, and several batches must all accumulate correctly,
+# and a round-trip back through a stream must reproduce the original values.
+test_that("batched convert to vector works for nanoarrow_vctr()", {
+ empty_stream <- basic_array_stream(list(), schema = na_string())
+ empty_vctr <- convert_array_stream(empty_stream, nanoarrow_vctr())
+ expect_length(empty_vctr, 0)
+ expect_identical(infer_nanoarrow_schema(empty_vctr)$format, "u")
+
+ stream1 <- basic_array_stream(list(c("one", "two", "three")))
+ vctr1 <- convert_array_stream(stream1, nanoarrow_vctr())
+ expect_length(vctr1, 3)
+
+ stream2 <- basic_array_stream(
+ list(c("one", "two", "three"), c("four", "five", "six", "seven"))
+ )
+ vctr2 <- convert_array_stream(stream2, nanoarrow_vctr())
+ expect_length(vctr2, 7)
+ expect_identical(
+ convert_array_stream(as_nanoarrow_array_stream(vctr2)),
+ c("one", "two", "three", "four", "five", "six", "seven")
+ )
+})
+
+# A nanoarrow_vctr ptype used as a data.frame column must survive the
+# record-style conversion path and round-trip its values.
+test_that("convert to vector works for data.frame(nanoarrow_vctr())", {
+ array <- as_nanoarrow_array(data.frame(x = 1:5))
+ df_vctr <- convert_array(array, data.frame(x = nanoarrow_vctr()))
+ expect_s3_class(df_vctr$x, "nanoarrow_vctr")
+ expect_identical(
+ convert_array_stream(as_nanoarrow_array_stream(df_vctr$x)),
+ 1:5
+ )
+})
+
+test_that("convert to vector works for list_of(nanoarrow_vctr())", {
+ skip_if_not_installed("arrow")
+ skip_if_not_installed("vctrs")
+
+ array <- as_nanoarrow_array(
+ list(1:5, 6:10, NULL, 11:13),
+ schema = na_list(na_int32())
+ )
+
+ list_vctr <- convert_array(array, vctrs::list_of(nanoarrow_vctr()))
+
+ # Each item in the list should be a vctr with one chunk that is a slice
+ # of the original array
+ # (the offsets 0/5/10 show the chunks share the parent's buffers rather
+ # than being copied)
+ expect_s3_class(list_vctr[[1]], "nanoarrow_vctr")
+ vctr_array <- attr(list_vctr[[1]], "chunks")[[1]]
+ expect_identical(vctr_array$offset, 0L)
+ expect_identical(vctr_array$length, 5L)
+ expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:5)
+
+ expect_s3_class(list_vctr[[2]], "nanoarrow_vctr")
+ vctr_array <- attr(list_vctr[[2]], "chunks")[[1]]
+ expect_identical(vctr_array$offset, 5L)
+ expect_identical(vctr_array$length, 5L)
+ expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:10)
+
+ # A NULL list element converts to NULL, not an empty vctr
+ expect_null(list_vctr[[3]])
+
+ expect_s3_class(list_vctr[[4]], "nanoarrow_vctr")
+ vctr_array <- attr(list_vctr[[4]], "chunks")[[1]]
+ expect_identical(vctr_array$offset, 10L)
+ expect_identical(vctr_array$length, 3L)
+ expect_identical(convert_buffer(vctr_array$buffers[[2]]), 1:13)
+})
+
+# The finalizer must pass the ptype's subclass through to the result for
+# zero, one, and multiple batches.
+test_that("batched convert to vector works for nanoarrow_vctr() keeps
subclass", {
+ vctr_ptype <- nanoarrow_vctr(subclass = "some_subclass")
+
+ empty_stream <- basic_array_stream(list(), schema = na_string())
+ empty_vctr <- convert_array_stream(empty_stream, vctr_ptype)
+ expect_s3_class(empty_vctr, "some_subclass")
+
+ stream1 <- basic_array_stream(list(c("")))
+ vctr1 <- convert_array_stream(stream1, vctr_ptype)
+ expect_s3_class(vctr1, "some_subclass")
+
+ stream2 <- basic_array_stream(list(c(""), c("")))
+ vctr2 <- convert_array_stream(stream2, vctr_ptype)
+ expect_s3_class(vctr2, "some_subclass")
+})
+
test_that("convert to vector works for struct-style vectors", {
array <- as_nanoarrow_array(as.POSIXlt("2021-01-01", tz = "America/Halifax"))
expect_identical(
diff --git a/r/tests/testthat/test-extension.R
b/r/tests/testthat/test-extension.R
index ceb5717b..5935394c 100644
--- a/r/tests/testthat/test-extension.R
+++ b/r/tests/testthat/test-extension.R
@@ -114,3 +114,40 @@ test_that("as_nanoarrow_array() dispatches on registered
extension spec", {
"some_ext"
)
})
+
+# An extension spec may declare that its default conversion target is a
+# nanoarrow_vctr subclass; the inferred ptype and the converted value must
+# both carry that subclass, and format()/print() must not recurse infinitely
+# when the default conversion itself yields a vctr.
+test_that("extensions can infer a schema of a nanoarrow_vctr() subclass", {
+ register_nanoarrow_extension(
+ "some_ext",
+ nanoarrow_extension_spec(subclass = "vctr_spec_class")
+ )
+ # Ensure the registration does not leak into other tests
+ on.exit(unregister_nanoarrow_extension("some_ext"))
+
+ infer_nanoarrow_ptype_extension.vctr_spec_class <- function(spec, x, ...) {
+ nanoarrow_vctr(subclass = "some_vctr_subclass")
+ }
+
+ s3_register(
+ "nanoarrow::infer_nanoarrow_ptype_extension",
+ "vctr_spec_class",
+ infer_nanoarrow_ptype_extension.vctr_spec_class
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(na_extension(na_string(), "some_ext")),
+ nanoarrow_vctr(subclass = "some_vctr_subclass")
+ )
+
+ ext_array <- nanoarrow_extension_array(c("one", "two", "three"), "some_ext")
+ vctr <- convert_array(ext_array)
+ expect_s3_class(vctr, "some_vctr_subclass")
+
+ # Ensure that registering a default conversion that returns a nanoarrow_vctr
+ # does not result in infinite recursion when printing or formatting it.
+ # An extension that does this should provide these methods for the subclass
+ # they return.
+ expect_length(format(vctr), length(vctr))
+ expect_output(
+ expect_identical(print(vctr), vctr),
+ "some_vctr_subclass"
+ )
+})
diff --git a/r/tests/testthat/test-vctr.R b/r/tests/testthat/test-vctr.R
new file mode 100644
index 00000000..95bbd14f
--- /dev/null
+++ b/r/tests/testthat/test-vctr.R
@@ -0,0 +1,237 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# A vctr over one array is backed by the identity index sequence 1:n, and
+# converting a vctr to a vctr is a no-op.
+test_that("as_nanoarrow_vctr() works for basic input", {
+ array <- as_nanoarrow_array(c("one", "two"))
+ vctr <- as_nanoarrow_vctr(array)
+ expect_identical(as.integer(unclass(vctr)), 1:2)
+ expect_identical(as_nanoarrow_vctr(vctr), vctr)
+
+ expect_identical(infer_nanoarrow_schema(vctr)$format, "u")
+ expect_identical(as_nanoarrow_schema(vctr)$format, "u")
+})
+
+# print()/str() must return their input invisibly and render the schema
+# ("<any>" when no schema was supplied).
+test_that("print() and str() work on empty nanoarrow_vctr", {
+ vctr <- nanoarrow_vctr()
+ expect_identical(
+ expect_output(print(vctr),"<nanoarrow_vctr <any>>"),
+ vctr
+ )
+
+ expect_identical(
+ expect_output(str(vctr), "<nanoarrow_vctr <any>>"),
+ vctr
+ )
+
+ vctr <- nanoarrow_vctr(na_int32())
+ expect_identical(
+ expect_output(print(vctr), "^<nanoarrow_vctr int32"),
+ vctr
+ )
+
+ expect_identical(
+ expect_output(str(vctr), "^<nanoarrow_vctr int32"),
+ vctr
+ )
+})
+
+test_that("print() and str() work on non-empty nanoarrow_vctr", {
+ array <- as_nanoarrow_array(c("one", "two"))
+ vctr <- as_nanoarrow_vctr(array)
+
+ expect_output(
+ expect_identical(print(vctr), vctr),
+ '"one" "two"'
+ )
+
+ expect_output(
+ expect_identical(str(vctr), vctr),
+ "List of 1"
+ )
+})
+
+# c() with a single argument is allowed (identity); concatenation of two
+# vctrs is explicitly unimplemented.
+test_that("nanoarrow_vctr() errors when c() is called", {
+ vctr <- nanoarrow_vctr(na_int32())
+ expect_identical(c(vctr), vctr)
+
+ expect_error(c(vctr, vctr), "not implemented")
+})
+
+# A vctr can be a data.frame column, but cannot itself be coerced to a
+# data.frame.
+test_that("nanoarrow_vctr works in a data.frame()", {
+ array <- as_nanoarrow_array(c("one", "two"))
+ vctr <- as_nanoarrow_vctr(array)
+ df <- data.frame(x = vctr)
+ expect_s3_class(df$x, "nanoarrow_vctr")
+
+ expect_error(as.data.frame(vctr), "cannot coerce object")
+})
+
+test_that("format() works for nanoarrow_vctr", {
+ empty_vctr <- nanoarrow_vctr(na_string())
+ expect_identical(format(empty_vctr), character())
+
+ array <- as_nanoarrow_array(c("one", "two"))
+ vctr <- as_nanoarrow_vctr(array)
+ expect_identical(format(vctr), format(c("one", "two")))
+
+ # Also check on a data.frame(), which needs some custom logic to work with
+ # the RStudio viewer
+ array <- as_nanoarrow_array(data.frame(x = 1:2, y = letters[1:2]))
+ vctr <- as_nanoarrow_vctr(array)
+ expect_identical(format(vctr), c("{x: 1, y: a}", "{x: 2, y: b}"))
+})
+
+test_that("nanoarrow_vctr to stream generates an empty stream for empty
slice", {
+ vctr <- new_nanoarrow_vctr(list(), na_string())
+ stream <- as_nanoarrow_array_stream(vctr)
+ schema_out <- stream$get_schema()
+ expect_identical(schema_out$format, "u")
+ expect_identical(collect_array_stream(stream), list())
+})
+
+test_that("nanoarrow_vctr to stream generates identical stream for identity
slice", {
+ array <- as_nanoarrow_array("one")
+ vctr <- new_nanoarrow_vctr(list(array), infer_nanoarrow_schema(array))
+
+ stream <- as_nanoarrow_array_stream(vctr)
+ schema_out <- stream$get_schema()
+ expect_identical(schema_out$format, "u")
+
+ collected <- collect_array_stream(stream)
+ expect_length(collected, 1)
+ expect_identical(
+ convert_buffer(array$buffers[[3]]),
+ "one"
+ )
+})
+
+# Slicing a multi-chunk vctr must emit only the chunks that intersect the
+# slice, with per-chunk offset/length adjusted (offsets are 0-based within
+# each chunk; chunk 1 holds elements 1:3, chunk 2 holds elements 4:7).
+test_that("nanoarrow_vctr to stream works for arbitrary slices", {
+ array1 <- as_nanoarrow_array(c("one", "two", "three"))
+ array2 <- as_nanoarrow_array(c("four", "five", "six", "seven"))
+ vctr <- new_nanoarrow_vctr(list(array1, array2),
infer_nanoarrow_schema(array1))
+
+ chunks16 <- collect_array_stream(
+ as_nanoarrow_array_stream(vctr[1:6])
+ )
+ expect_length(chunks16, 2)
+ expect_identical(chunks16[[1]]$offset, 0L)
+ expect_identical(chunks16[[1]]$length, 3L)
+ expect_identical(chunks16[[2]]$offset, 0L)
+ expect_identical(chunks16[[2]]$length, 3L)
+
+ chunks34 <- collect_array_stream(
+ as_nanoarrow_array_stream(vctr[3:4])
+ )
+ expect_length(chunks34, 2)
+ expect_identical(chunks34[[1]]$offset, 2L)
+ expect_identical(chunks34[[1]]$length, 1L)
+ expect_identical(chunks34[[2]]$offset, 0L)
+ expect_identical(chunks34[[2]]$length, 1L)
+
+ chunks13 <- collect_array_stream(
+ as_nanoarrow_array_stream(vctr[1:3])
+ )
+ expect_length(chunks13, 1)
+ expect_identical(chunks13[[1]]$offset, 0L)
+ expect_identical(chunks13[[1]]$length, 3L)
+
+ chunks46 <- collect_array_stream(
+ as_nanoarrow_array_stream(vctr[4:6])
+ )
+ expect_length(chunks46, 1)
+ expect_identical(chunks46[[1]]$offset, 0L)
+ expect_identical(chunks46[[1]]$length, 3L)
+
+ chunks56 <- collect_array_stream(
+ as_nanoarrow_array_stream(vctr[5:6])
+ )
+ expect_length(chunks56, 1)
+ expect_identical(chunks56[[1]]$offset, 1L)
+ expect_identical(chunks56[[1]]$length, 2L)
+
+ chunks57 <- collect_array_stream(
+ as_nanoarrow_array_stream(vctr[5:7])
+ )
+ expect_length(chunks57, 1)
+ expect_identical(chunks57[[1]]$offset, 1L)
+ expect_identical(chunks57[[1]]$length, 3L)
+})
+
+# Only contiguous, increasing slices are supported; assignment is not.
+test_that("Errors occur for unsupported subset operations", {
+ array <- as_nanoarrow_array("one")
+ vctr <- as_nanoarrow_vctr(array)
+ expect_error(
+ vctr[5:1],
+ "Can't subset nanoarrow_vctr with non-slice"
+ )
+
+ expect_error(
+ vctr[1] <- "something",
+ "subset assignment for nanoarrow_vctr is not supported"
+ )
+
+ expect_error(
+ vctr[[1]] <- "something",
+ "subset assignment for nanoarrow_vctr is not supported"
+ )
+})
+
+# vctr_as_slice() returns c(start, length) for consecutive integer indices
+# (NA start for a zero-length slice) and NULL for anything else.
+test_that("slice detector works", {
+ expect_identical(
+ vctr_as_slice(logical()),
+ NULL
+ )
+
+ expect_identical(
+ vctr_as_slice(2:1),
+ NULL
+ )
+
+ expect_identical(
+ vctr_as_slice(integer()),
+ c(NA_integer_, 0L)
+ )
+
+ expect_identical(
+ vctr_as_slice(2L),
+ c(2L, 1L)
+ )
+
+ expect_identical(
+ vctr_as_slice(1:10),
+ c(1L, 10L)
+ )
+
+ expect_identical(
+ vctr_as_slice(10:2048),
+ c(10L, (2048L - 10L + 1L))
+ )
+})
+
+# vctr_resolve_chunk() maps 0-based element indices to 0-based chunk indices
+# given the cumulative offsets vector; out-of-range indices map to NA.
+test_that("chunk resolver works", {
+ chunk_offset1 <- 0:10
+
+ expect_identical(
+ vctr_resolve_chunk(c(-1L, 11L), chunk_offset1),
+ c(NA_integer_, NA_integer_)
+ )
+
+ expect_identical(
+ vctr_resolve_chunk(9:0, chunk_offset1),
+ 9:0
+ )
+})
diff --git a/src/nanoarrow/buffer_inline.h b/src/nanoarrow/buffer_inline.h
index 54a00a92..e68de587 100644
--- a/src/nanoarrow/buffer_inline.h
+++ b/src/nanoarrow/buffer_inline.h
@@ -50,6 +50,27 @@ static inline int64_t ArrowResolveChunk64(int64_t index,
const int64_t* offsets,
return lo;
}
// Binary search over a cumulative offsets array: returns the index i in
// [lo, hi) such that offsets[i] <= index < offsets[i + 1]. Similar to
// std::upper_bound() minus one, exploiting the invariant offsets[0] == 0.
static inline int64_t ArrowResolveChunk32(int32_t index, const int32_t* offsets,
                                          int32_t lo, int32_t hi) {
  int32_t n = hi - lo;
  // Precondition: lo < hi (callers guarantee at least one chunk boundary pair)
  NANOARROW_DCHECK(n > 1);
  while (n > 1) {
    const int32_t half = n >> 1;
    const int32_t mid = lo + half;
    if (index >= offsets[mid]) {
      lo = mid;
      n -= half;
    } else {
      n = half;
    }
  }

  return lo;
}
+
static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t
new_capacity) {
int64_t doubled_capacity = current_capacity * 2;
if (doubled_capacity > new_capacity) {
diff --git a/src/nanoarrow/utils_test.cc b/src/nanoarrow/utils_test.cc
index d73d18a3..e2eef993 100644
--- a/src/nanoarrow/utils_test.cc
+++ b/src/nanoarrow/utils_test.cc
@@ -555,6 +555,18 @@ TEST(UtilsTest, ArrowResolveChunk64Test) {
EXPECT_EQ(ArrowResolveChunk64(5, offsets, 0, n_offsets), 2);
}
+TEST(UtilsTest, ArrowResolveChunk32Test) {
+ int32_t offsets[] = {0, 2, 3, 6};
+ int32_t n_offsets = 4;
+
+ EXPECT_EQ(ArrowResolveChunk32(0, offsets, 0, n_offsets), 0);
+ EXPECT_EQ(ArrowResolveChunk32(1, offsets, 0, n_offsets), 0);
+ EXPECT_EQ(ArrowResolveChunk32(2, offsets, 0, n_offsets), 1);
+ EXPECT_EQ(ArrowResolveChunk32(3, offsets, 0, n_offsets), 2);
+ EXPECT_EQ(ArrowResolveChunk32(4, offsets, 0, n_offsets), 2);
+ EXPECT_EQ(ArrowResolveChunk32(5, offsets, 0, n_offsets), 2);
+}
+
TEST(MaybeTest, ConstructionAndConversion) {
using nanoarrow::NA;
using nanoarrow::internal::Maybe;