This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 16c8df7 Implement conversion from ArrowArray to R vector for basic
types (#50)
16c8df7 is described below
commit 16c8df72adc4939a6af70c0e83d37176195069b5
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Sep 29 15:54:47 2022 -0300
Implement conversion from ArrowArray to R vector for basic types (#50)
* first stab at ALTREP
* tweak entry point to altrep
* incremental progress on methods
* add some R accessors for altrep stuff
* fix helpers
* more altrep tweaks
* more polished altrep conversion
* move materializer to its own thing
* infrastructure for exporting independent arrays
* first try at exporting columns of a record batch
* working but probably with memory leak
* remove the bytes checking (leave that to valgrind)
* maybe run valgrind on the R package
* maybe fix workflow
* suppress jemalloc tls on valgrind check
* use our own conversion for data.frame and character
* ensure we attach a clean array to the altrep object
* fix S3
* skip valgrind for now on CI
* also materialize integers
* move integer altrep tests that aren't actually altrep
* start on infra
* a bit more concrete type inference
* using our own infer infrastructure
* with our own conversions to R
* align vector type names, format
* more double conversion
* better error messages
* remove some unused altrep infrastructure
* better errors for ptype inference
* basic logical conversion
* document some things
* add vector generators
* fix NA generation
* fix call in errors
* tibble support, fix vec_gen for cmd check
* basic binary/list support
---
.github/workflows/r-check.yaml | 14 +
r/DESCRIPTION | 1 +
r/NAMESPACE | 3 +
r/R/altrep.R | 19 ++
r/R/array-convert-vector.R | 127 +++++++++
r/R/array.R | 38 +--
r/R/util.R | 38 +++
r/R/zzz.R | 2 +
r/man/as_nanoarrow_array.Rd | 8 -
r/man/from_nanoarrow_array.Rd | 25 ++
r/src/altrep.c | 216 +++++++++++++++
r/src/altrep.h | 61 +++++
r/src/array.c | 52 +---
r/src/array.h | 82 +++++-
r/src/array_convert_vector.c | 359 +++++++++++++++++++++++++
r/src/array_stream.h | 15 +-
r/src/array_view.c | 66 +++++
r/src/array_view.h | 52 ++++
r/src/init.c | 20 +-
r/src/materialize.c | 278 ++++++++++++++++++++
r/src/materialize.h | 36 +++
r/src/{pointers-cpp.cc => pointers_cpp.cc} | 0
r/src/schema.c | 5 +
r/src/schema.h | 12 +
r/tests/testthat/test-altrep.R | 82 ++++++
r/tests/testthat/test-array-convert-vector.R | 375 +++++++++++++++++++++++++++
r/tests/testthat/test-array.R | 11 +-
r/tests/testthat/test-pkg-arrow.R | 43 +++
r/tests/testthat/test-util.R | 19 ++
r/tools/make-callentries.R | 1 +
30 files changed, 1966 insertions(+), 94 deletions(-)
diff --git a/.github/workflows/r-check.yaml b/.github/workflows/r-check.yaml
index cfd95e3..059afc2 100644
--- a/.github/workflows/r-check.yaml
+++ b/.github/workflows/r-check.yaml
@@ -56,6 +56,10 @@ jobs:
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true
+ - name: Install valgrind on r-devel
+ if: matrix.config.r == 'devel'
+ run: sudo apt-get install -y valgrind
+
# Probably a better way to do this, but for now do the vendor step
manually
- name: Vendor nanoarrow into the R package
run: |
@@ -67,9 +71,19 @@ jobs:
extra-packages: any::rcmdcheck
needs: check
working-directory: r
+
- uses: r-lib/actions/check-r-package@v2
env:
ARROW_R_VERBOSE_TEST: "true"
with:
upload-snapshots: true
working-directory: r
+
+ # Something about Arrow installed via GHA doesn't work with valgrind
+ # so we can't run it on CI :(
+ # - name: Run valgrind on r-devel
+ # if: matrix.config.r == 'devel'
+ # run: |
+ # cd r
+ # R CMD INSTALL .
+ # echo "testthat::test_local()" | R --no-save -d "valgrind
--tool=memcheck --leak-check=full --suppressions ../valgrind.supp"
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 49239ba..d84baf4 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -24,5 +24,6 @@ BugReports: https://github.com/apache/arrow-nanoarrow/issues
Suggests:
arrow (>= 8.0.0),
testthat (>= 3.0.0),
+ tibble,
vctrs
Config/testthat/edition: 3
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 14da83f..1f285a3 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -26,6 +26,8 @@ S3method(format,nanoarrow_array_stream)
S3method(format,nanoarrow_buffer)
S3method(format,nanoarrow_schema)
S3method(from_nanoarrow_array,default)
+S3method(from_nanoarrow_array,tbl_df)
+S3method(from_nanoarrow_array,vctrs_partial_frame)
S3method(infer_nanoarrow_schema,default)
S3method(infer_nanoarrow_schema,nanoarrow_array)
S3method(infer_nanoarrow_schema,nanoarrow_array_stream)
@@ -47,6 +49,7 @@ export(as_nanoarrow_array)
export(as_nanoarrow_array_stream)
export(as_nanoarrow_schema)
export(from_nanoarrow_array)
+export(infer_nanoarrow_ptype)
export(infer_nanoarrow_schema)
export(nanoarrow_allocate_array)
export(nanoarrow_allocate_array_stream)
diff --git a/r/R/altrep.R b/r/R/altrep.R
new file mode 100644
index 0000000..ce3e40b
--- /dev/null
+++ b/r/R/altrep.R
@@ -0,0 +1,19 @@
+
+# For testing the altrep chr conversion
+nanoarrow_altrep_chr <- function(array) {
+ schema <- infer_nanoarrow_schema(array)
+ array_view <- .Call(nanoarrow_c_array_view, array, schema)
+ .Call(nanoarrow_c_make_altrep_chr, array_view)
+}
+
+is_nanoarrow_altrep <- function(x) {
+ .Call(nanoarrow_c_is_altrep, x)
+}
+
+nanoarrow_altrep_force_materialize <- function(x, recursive = FALSE) {
+ invisible(.Call(nanoarrow_c_altrep_force_materialize, x, recursive))
+}
+
+is_nanoarrow_altrep_materialized <- function(x) {
+ .Call(nanoarrow_c_altrep_is_materialized, x)
+}
diff --git a/r/R/array-convert-vector.R b/r/R/array-convert-vector.R
new file mode 100644
index 0000000..3209fc4
--- /dev/null
+++ b/r/R/array-convert-vector.R
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+#' Convert an Array to an R vector
+#'
+#' @param array A [nanoarrow_array][as_nanoarrow_array].
+#' @param to A target prototype object describing the type to which `array`
+#' should be converted, or `NULL` to use the default conversion.
+#' @param ... Passed to S3 methods
+#'
+#' @return An R vector of type `to`.
+#' @export
+#'
+from_nanoarrow_array <- function(array, to = NULL, ...) {
+ stopifnot(inherits(array, "nanoarrow_array"))
+ UseMethod("from_nanoarrow_array", to)
+}
+
+#' @export
+from_nanoarrow_array.default <- function(array, to = NULL, ..., .from_c =
FALSE) {
+ if (.from_c) {
+ stop_cant_convert_array(array, to)
+ }
+
+ .Call(nanoarrow_c_from_array, array, to)
+}
+
+# This is defined because it's verbose to pass named arguments from C.
+# When converting data frame columns, we try the internal C conversions
+# first to save R evaluation overhead. When the internal conversions fail,
+# we call from_nanoarrow_array() to dispatch to conversions defined via S3
+# dispatch, making sure to let the default method know that we've already
+# tried the internal C conversions.
+from_nanoarrow_array_from_c <- function(array, to) {
+ from_nanoarrow_array(array, to, .from_c = TRUE)
+}
+
+#' @export
+from_nanoarrow_array.vctrs_partial_frame <- function(array, to, ...) {
+ ptype <- infer_nanoarrow_ptype(array)
+ if (!is.data.frame(ptype)) {
+ stop_cant_convert_array(array, to)
+ }
+
+ ptype <- vctrs::vec_ptype_common(ptype, to)
+ .Call(nanoarrow_c_from_array, array, ptype)
+}
+
+#' @export
+from_nanoarrow_array.tbl_df <- function(array, to, ...) {
+ df <- from_nanoarrow_array(array, as.data.frame(to))
+ tibble::as_tibble(df)
+}
+
+#' @rdname from_nanoarrow_array
+#' @export
+infer_nanoarrow_ptype <- function(array) {
+ stopifnot(inherits(array, "nanoarrow_array"))
+ .Call(nanoarrow_c_infer_ptype, array)
+}
+
+stop_cant_infer_ptype <- function(array) {
+ schema <- infer_nanoarrow_schema(array)
+
+ if (is.null(schema$name) || identical(schema$name, "")) {
+ cnd <- simpleError(
+ sprintf(
+ "Can't infer R vector type for array <%s>",
+ schema$format
+ ),
+ call = sys.call(-1)
+ )
+ } else {
+ cnd <- simpleError(
+ sprintf(
+ "Can't infer R vector type for `%s` <%s>",
+ schema$name,
+ schema$format
+ ),
+ call = sys.call(-1)
+ )
+ }
+
+ stop(cnd)
+}
+
+stop_cant_convert_array <- function(array, to) {
+ schema <- infer_nanoarrow_schema(array)
+
+ if (is.null(schema$name) || identical(schema$name, "")) {
+ cnd <- simpleError(
+ sprintf(
+ "Can't convert array <%s> to R vector of type %s",
+ schema$format,
+ class(to)[1]
+ ),
+ call = sys.call(-1)
+ )
+ } else {
+ cnd <- simpleError(
+ sprintf(
+ "Can't convert `%s` <%s> to R vector of type %s",
+ schema$name,
+ schema$format,
+ class(to)[1]
+ ),
+ call = sys.call(-1)
+ )
+ }
+
+ stop(cnd)
+}
diff --git a/r/R/array.R b/r/R/array.R
index f8e5612..f7c0481 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -27,9 +27,6 @@
#' @param x An object to convert to a array
#' @param schema An optional schema used to enforce conversion to a particular
#' type. Defaults to [infer_nanoarrow_schema()].
-#' @param to A target prototype object describing the type to which `array`
-#' should be converted.
-#' @param array An object of class 'nanoarrow_array'
#' @param ... Passed to S3 methods
#'
#' @return An object of class 'nanoarrow_array'
@@ -46,22 +43,25 @@ as_nanoarrow_array <- function(x, ..., schema = NULL) {
UseMethod("as_nanoarrow_array")
}
-#' @rdname as_nanoarrow_array
-#' @export
-from_nanoarrow_array <- function(array, to = NULL, ...) {
- stopifnot(inherits(array, "nanoarrow_array"))
- UseMethod("from_nanoarrow_array", to)
-}
-
#' @export
as.vector.nanoarrow_array <- function(x, mode = "any") {
stopifnot(identical(mode, "any"))
- from_nanoarrow_array(x)
+ from_nanoarrow_array(x, to = infer_nanoarrow_ptype(x))
}
#' @export
as.data.frame.nanoarrow_array <- function(x, ...) {
- from_nanoarrow_array(x, to = vctrs::partial_frame())
+ schema <- infer_nanoarrow_schema(x)
+ if (schema$format != "+s") {
+ stop(sprintf("Can't convert array with schema '%s' to data.frame()",
schema$format))
+ }
+
+ .Call(nanoarrow_c_from_array, x, NULL)
+}
+
+# exported in zzz.R
+as_tibble.nanoarrow_array <- function(x, ...) {
+ tibble::as_tibble(as.data.frame.nanoarrow_array(x), ...)
}
#' @export
@@ -75,20 +75,6 @@ as_nanoarrow_array.default <- function(x, ..., schema =
NULL) {
}
}
-#' @export
-from_nanoarrow_array.default <- function(array, to = NULL, ...) {
- # For now, use arrow's conversion for everything
- result <- as.vector(arrow::as_arrow_array(array))
-
- # arrow's conversion doesn't support `to`, so for now use an R cast
- # workaround for a bug in vctrs: https://github.com/r-lib/vctrs/issues/1642
- if (inherits(result, "tbl_df")) {
- result <- new_data_frame(result, nrow(result))
- }
-
- vctrs::vec_cast(result, to)
-}
-
#' @export
infer_nanoarrow_schema.nanoarrow_array <- function(x, ...) {
.Call(nanoarrow_c_infer_schema_array, x) %||%
diff --git a/r/R/util.R b/r/R/util.R
index 06e638b..8fb1ed0 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -22,3 +22,41 @@
new_data_frame <- function(x, nrow) {
structure(x, row.names = c(NA, nrow), class = "data.frame")
}
+
+vec_gen <- function(ptype, n = 1e3, prop_true = 0.5, prop_na = 0,
+ chr_len = function(n) ceiling(25 * stats::runif(n))) {
+ vec <- switch(
+ class(ptype)[1],
+ logical = stats::runif(n) < prop_true,
+ integer = as.integer(stats::runif(n, min = -1, max = 1) *
.Machine$integer.max),
+ numeric = stats::runif(n),
+ character = strrep(rep_len(letters, n), chr_len(n)),
+ data.frame = new_data_frame(
+ lapply(
+ ptype,
+ vec_gen,
+ n = n,
+ prop_true = prop_true,
+ prop_na = prop_na,
+ chr_len = chr_len
+ ),
+ n
+ ),
+ stop(sprintf("Don't know how to generate vector for type %s",
class(ptype)[1]))
+ )
+
+ if (!is.data.frame(vec) && prop_na > 0) {
+ is_na <- stats::runif(n) < prop_na
+ vec[is_na] <- ptype[NA_integer_]
+ }
+
+ vec
+}
+
+vec_shuffle <- function(x) {
+ if (is.data.frame(x)) {
+ x[sample(seq_len(nrow(x)), replace = FALSE), , drop = FALSE]
+ } else {
+ x[sample(seq_along(x), replace = FALSE)]
+ }
+}
diff --git a/r/R/zzz.R b/r/R/zzz.R
index 54f9c9a..e2dd492 100644
--- a/r/R/zzz.R
+++ b/r/R/zzz.R
@@ -25,6 +25,8 @@
s3_register("arrow::as_record_batch", "nanoarrow_array")
s3_register("arrow::as_arrow_table", "nanoarrow_array")
s3_register("arrow::as_record_batch_reader", "nanoarrow_array_stream")
+
+ s3_register("tibble::as_tibble", "nanoarrow_array")
}
# From the `vctrs` package (this function is intended to be copied
diff --git a/r/man/as_nanoarrow_array.Rd b/r/man/as_nanoarrow_array.Rd
index 353dfa0..fccf5cc 100644
--- a/r/man/as_nanoarrow_array.Rd
+++ b/r/man/as_nanoarrow_array.Rd
@@ -2,12 +2,9 @@
% Please edit documentation in R/array.R
\name{as_nanoarrow_array}
\alias{as_nanoarrow_array}
-\alias{from_nanoarrow_array}
\title{Convert an object to a nanoarrow array}
\usage{
as_nanoarrow_array(x, ..., schema = NULL)
-
-from_nanoarrow_array(array, to = NULL, ...)
}
\arguments{
\item{x}{An object to convert to a array}
@@ -16,11 +13,6 @@ from_nanoarrow_array(array, to = NULL, ...)
\item{schema}{An optional schema used to enforce conversion to a particular
type. Defaults to
\code{\link[=infer_nanoarrow_schema]{infer_nanoarrow_schema()}}.}
-
-\item{array}{An object of class 'nanoarrow_array'}
-
-\item{to}{A target prototype object describing the type to which \code{array}
-should be converted.}
}
\value{
An object of class 'nanoarrow_array'
diff --git a/r/man/from_nanoarrow_array.Rd b/r/man/from_nanoarrow_array.Rd
new file mode 100644
index 0000000..e4f8d70
--- /dev/null
+++ b/r/man/from_nanoarrow_array.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/array-convert-vector.R
+\name{from_nanoarrow_array}
+\alias{from_nanoarrow_array}
+\alias{infer_nanoarrow_ptype}
+\title{Convert an Array to an R vector}
+\usage{
+from_nanoarrow_array(array, to = NULL, ...)
+
+infer_nanoarrow_ptype(array)
+}
+\arguments{
+\item{array}{A \link[=as_nanoarrow_array]{nanoarrow_array}.}
+
+\item{to}{A target prototype object describing the type to which \code{array}
+should be converted, or \code{NULL} to use the default conversion.}
+
+\item{...}{Passed to S3 methods}
+}
+\value{
+An R vector of type \code{to}.
+}
+\description{
+Convert an Array to an R vector
+}
diff --git a/r/src/altrep.c b/r/src/altrep.c
new file mode 100644
index 0000000..dbb9e39
--- /dev/null
+++ b/r/src/altrep.c
@@ -0,0 +1,216 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "altrep.h"
+#include "array.h"
+#include "nanoarrow.h"
+
+#include "materialize.h"
+
+#ifdef HAS_ALTREP
+
+// This file defines all ALTREP classes used to speed up conversion
+// from an arrow_array to an R vector. Currently only string and
+// large string arrays are converted to ALTREP.
+//
+// All ALTREP classes follow some common patterns:
+//
+// - R_altrep_data1() holds an external pointer to a struct ArrowArrayView
+// - R_altrep_data2() holds the materialized version of the vector.
+// - When materialization happens, we set R_altrep_data1() to R_NilValue
+// to ensure we don't hold on to any more resources than needed.
+
+static R_xlen_t nanoarrow_altrep_length(SEXP altrep_sexp) {
+ SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
+ if (array_view_xptr == R_NilValue) {
+ return Rf_xlength(R_altrep_data2(altrep_sexp));
+ }
+
+ struct ArrowArrayView* array_view =
+ (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
+ return array_view->array->length;
+}
+
+static Rboolean nanoarrow_altrep_inspect(SEXP altrep_sexp, int pre, int deep,
int pvec,
+ void (*inspect_subtree)(SEXP, int,
int, int)) {
+ SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
+ const char* materialized = "";
+ if (array_view_xptr == R_NilValue) {
+ materialized = "materialized ";
+ }
+
+ R_xlen_t len = nanoarrow_altrep_length(altrep_sexp);
+ const char* class_name = nanoarrow_altrep_class(altrep_sexp);
+ Rprintf("<%s%s[%ld]>\n", materialized, class_name, (long)len);
+ return TRUE;
+}
+
+static SEXP nanoarrow_altstring_elt(SEXP altrep_sexp, R_xlen_t i) {
+ SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
+ if (array_view_xptr == R_NilValue) {
+ return STRING_ELT(R_altrep_data2(altrep_sexp), i);
+ }
+
+ struct ArrowArrayView* array_view =
+ (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
+
+ if (ArrowArrayViewIsNull(array_view, i)) {
+ return NA_STRING;
+ }
+
+ struct ArrowStringView item = ArrowArrayViewGetStringUnsafe(array_view, i);
+ return Rf_mkCharLenCE(item.data, item.n_bytes, CE_UTF8);
+}
+
+static SEXP nanoarrow_altstring_materialize(SEXP altrep_sexp) {
+ SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
+ if (array_view_xptr == R_NilValue) {
+ return R_altrep_data2(altrep_sexp);
+ }
+
+ struct ArrowArrayView* array_view =
+ (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
+
+ SEXP result = PROTECT(nanoarrow_materialize_chr(array_view));
+ R_set_altrep_data2(altrep_sexp, result);
+ R_set_altrep_data1(altrep_sexp, R_NilValue);
+ UNPROTECT(1);
+ return result;
+}
+
+static void* nanoarrow_altrep_dataptr(SEXP altrep_sexp, Rboolean writable) {
+ return DATAPTR(nanoarrow_altstring_materialize(altrep_sexp));
+}
+
+static const void* nanoarrow_altrep_dataptr_or_null(SEXP altrep_sexp) {
+ SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
+ if (array_view_xptr == R_NilValue) {
+ return DATAPTR_OR_NULL(R_altrep_data2(altrep_sexp));
+ }
+
+ return NULL;
+}
+
+static R_altrep_class_t nanoarrow_altrep_chr_cls;
+
+#endif
+
+static void register_nanoarrow_altstring(DllInfo* info) {
+#ifdef HAS_ALTREP
+ nanoarrow_altrep_chr_cls =
+ R_make_altstring_class("nanoarrow::altrep_chr", "nanoarrow", info);
+ R_set_altrep_Length_method(nanoarrow_altrep_chr_cls,
&nanoarrow_altrep_length);
+ R_set_altrep_Inspect_method(nanoarrow_altrep_chr_cls,
&nanoarrow_altrep_inspect);
+ R_set_altvec_Dataptr_or_null_method(nanoarrow_altrep_chr_cls,
+ &nanoarrow_altrep_dataptr_or_null);
+ R_set_altvec_Dataptr_method(nanoarrow_altrep_chr_cls,
&nanoarrow_altrep_dataptr);
+
+ R_set_altstring_Elt_method(nanoarrow_altrep_chr_cls,
&nanoarrow_altstring_elt);
+
+ // Notes about other available methods:
+ //
+ // - The no_na method never seems to get called (anyNA() doesn't seem to
+ // use it)
+ // - Because set_Elt is not defined, SET_STRING_ELT() will modify the
+ // technically modify the materialized value. The object has been marked
+ // immutable but in the case of a string this is fine because we
materialize
+ // when this happens (via Dataptr).
+ // - It may be beneficial to implement the Extract_subset method to defer
string
+ // conversion even longer since this is expensive compared to rearranging
integer
+ // indices.
+#endif
+}
+
+void register_nanoarrow_altrep(DllInfo* info) {
register_nanoarrow_altstring(info); }
+
+SEXP nanoarrow_c_make_altrep_chr(SEXP array_view_xptr) {
+#ifdef HAS_ALTREP
+ struct ArrowArrayView* array_view =
+ (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
+
+ switch (array_view->storage_type) {
+ case NANOARROW_TYPE_STRING:
+ case NANOARROW_TYPE_LARGE_STRING:
+ break;
+ default:
+ return R_NilValue;
+ }
+
+ // Ensure the array that we're attaching to this ALTREP object does not keep
its
+ // parent struct alive unnecessarily (i.e., a user can select only a few
columns
+ // and the memory for the unused columns will be released).
+ SEXP array_xptr_independent =
+
PROTECT(array_xptr_ensure_independent(R_ExternalPtrProtected(array_view_xptr)));
+ array_view->array = array_from_xptr(array_xptr_independent);
+ R_SetExternalPtrProtected(array_view_xptr, array_xptr_independent);
+ UNPROTECT(1);
+
+ Rf_setAttrib(array_view_xptr, R_ClassSymbol,
Rf_mkString("nanoarrow::altrep_chr"));
+ SEXP out = PROTECT(R_new_altrep(nanoarrow_altrep_chr_cls, array_view_xptr,
R_NilValue));
+ MARK_NOT_MUTABLE(out);
+ UNPROTECT(1);
+ return out;
+#else
+ return R_NilValue;
+#endif
+}
+
+SEXP nanoarrow_c_is_altrep(SEXP x_sexp) {
+ return Rf_ScalarLogical(is_nanoarrow_altrep(x_sexp));
+}
+
+SEXP nanoarrow_c_altrep_is_materialized(SEXP x_sexp) {
+ const char* class_name = nanoarrow_altrep_class(x_sexp);
+ if (class_name == NULL || strncmp(class_name, "nanoarrow::", 11) != 0) {
+ return Rf_ScalarLogical(NA_LOGICAL);
+ } else {
+ return Rf_ScalarLogical(R_altrep_data1(x_sexp) == R_NilValue);
+ }
+}
+
+SEXP nanoarrow_c_altrep_force_materialize(SEXP x_sexp, SEXP recursive_sexp) {
+ // The recursive flag lets a developer/user force materialization of any
+ // string columns in a data.frame that came from nanoarrow.
+ if (Rf_inherits(x_sexp, "data.frame") && LOGICAL(recursive_sexp)[0]) {
+ int n_materialized = 0;
+ for (R_xlen_t i = 0; i < Rf_xlength(x_sexp); i++) {
+ SEXP n_materialized_sexp = PROTECT(
+ nanoarrow_c_altrep_force_materialize(VECTOR_ELT(x_sexp, i),
recursive_sexp));
+ n_materialized += INTEGER(n_materialized_sexp)[0];
+ UNPROTECT(1);
+ }
+ return Rf_ScalarInteger(n_materialized);
+ }
+
+ const char* class_name = nanoarrow_altrep_class(x_sexp);
+ if (class_name && strcmp(class_name, "nanoarrow::altrep_chr") == 0) {
+ // Force materialization even if already materialized (the method
+ // should be safe to call more than once as written here)
+ int already_materialized = R_altrep_data1(x_sexp) == R_NilValue;
+ nanoarrow_altstring_materialize(x_sexp);
+ return Rf_ScalarInteger(!already_materialized);
+ } else {
+ return Rf_ScalarInteger(0);
+ }
+}
diff --git a/r/src/altrep.h b/r/src/altrep.h
new file mode 100644
index 0000000..ef01b21
--- /dev/null
+++ b/r/src/altrep.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_ALTREP_H_INCLUDED
+#define R_ALTREP_H_INCLUDED
+
+#include "Rversion.h"
+
+#include <string.h>
+
+// ALTREP available in R >= 3.5
+#if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0)
+
+#define HAS_ALTREP
+#include <R_ext/Altrep.h>
+
+// Returns the ALTREP class name or NULL if x is not an altrep
+// object.
+static inline const char* nanoarrow_altrep_class(SEXP x) {
+ if (ALTREP(x)) {
+ SEXP data_class_sym = CAR(ATTRIB(ALTREP_CLASS(x)));
+ return CHAR(PRINTNAME(data_class_sym));
+ } else {
+ return NULL;
+ }
+}
+
+#else
+
+static inline const char* nanoarrow_altrep_class(SEXP x) { return NULL; }
+
+#endif
+
+// Performs the ALTREP type registration and should be called on package load
+void register_nanoarrow_altrep(DllInfo* info);
+
+// Checks if an object is an ALTREP object created by this package
+static inline int is_nanoarrow_altrep(SEXP x) {
+ const char* class_name = nanoarrow_altrep_class(x);
+ return class_name && strncmp(class_name, "nanoarrow::", 11) == 0;
+}
+
+// Creates an altstring vector backed by a nanoarrow array or returns
+// R_NilValue if the conversion is not possible.
+SEXP nanoarrow_c_make_altrep_chr(SEXP array_view_xptr);
+
+#endif
diff --git a/r/src/array.c b/r/src/array.c
index b4d5777..b98b331 100644
--- a/r/src/array.c
+++ b/r/src/array.c
@@ -82,6 +82,15 @@ static SEXP borrow_array_xptr(struct ArrowArray* array, SEXP
shelter) {
return array_xptr;
}
+SEXP borrow_array_child_xptr(SEXP array_xptr, int64_t i) {
+ struct ArrowArray* array = array_from_xptr(array_xptr);
+ SEXP schema_xptr = R_ExternalPtrTag(array_xptr);
+ SEXP child_xptr = PROTECT(borrow_array_xptr(array->children[i], array_xptr));
+ array_xptr_set_schema(child_xptr, borrow_schema_child_xptr(schema_xptr, i));
+ UNPROTECT(1);
+ return child_xptr;
+}
+
static SEXP borrow_array_view_child(struct ArrowArrayView* array_view, int64_t
i,
SEXP shelter) {
if (array_view != NULL) {
@@ -213,7 +222,8 @@ static SEXP borrow_buffer(struct ArrowArrayView*
array_view, int64_t i, SEXP she
const char* names[] = {"size_bytes", "element_size_bits", ""};
SEXP buffer_info = PROTECT(Rf_mkNamed(VECSXP, names));
SET_VECTOR_ELT(buffer_info, 0,
length_from_int64(array_view->buffer_views[i].n_bytes));
- SET_VECTOR_ELT(buffer_info, 1,
length_from_int64(array_view->layout.element_size_bits[i]));
+ SET_VECTOR_ELT(buffer_info, 1,
+ length_from_int64(array_view->layout.element_size_bits[i]));
SEXP buffer =
PROTECT(R_MakeExternalPtr((void*)array_view->buffer_views[i].data.data,
buffer_info, shelter));
@@ -222,42 +232,6 @@ static SEXP borrow_buffer(struct ArrowArrayView*
array_view, int64_t i, SEXP she
return buffer;
}
-static void finalize_array_view_xptr(SEXP array_view_xptr) {
- struct ArrowArrayView* array_view =
- (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
- if (array_view != NULL) {
- ArrowArrayViewReset(array_view);
- ArrowFree(array_view);
- }
-}
-
-SEXP nanoarrow_c_array_view(SEXP array_xptr, SEXP schema_xptr) {
- struct ArrowArray* array = array_from_xptr(array_xptr);
- struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
-
- struct ArrowError error;
- ArrowErrorSet(&error, "");
-
- struct ArrowArrayView* array_view =
- (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView));
- ArrowArrayViewInit(array_view, NANOARROW_TYPE_UNINITIALIZED);
- SEXP xptr = PROTECT(R_MakeExternalPtr(array_view, R_NilValue, array_xptr));
- R_RegisterCFinalizer(xptr, &finalize_array_view_xptr);
-
- int result = ArrowArrayViewInitFromSchema(array_view, schema, &error);
- if (result != NANOARROW_OK) {
- Rf_error("<ArrowArrayViewInitFromSchema> %s", error.message);
- }
-
- result = ArrowArrayViewSetArray(array_view, array, &error);
- if (result != NANOARROW_OK) {
- Rf_error("<ArrowArrayViewSetArray> %s", error.message);
- }
-
- UNPROTECT(1);
- return xptr;
-}
-
SEXP nanoarrow_c_array_proxy(SEXP array_xptr, SEXP array_view_xptr, SEXP
recursive_sexp) {
struct ArrowArray* array = array_from_xptr(array_xptr);
int recursive = LOGICAL(recursive_sexp)[0];
@@ -322,9 +296,5 @@ SEXP nanoarrow_c_array_proxy(SEXP array_xptr, SEXP
array_view_xptr, SEXP recursi
void finalize_exported_array(struct ArrowArray* array) {
SEXP array_xptr = (SEXP)array->private_data;
R_ReleaseObject(array_xptr);
-
- // TODO: properly relocate child arrays
- //
https://arrow.apache.org/docs/format/CDataInterface.html#moving-child-arrays
-
array->release = NULL;
}
diff --git a/r/src/array.h b/r/src/array.h
index d513e1a..f974450 100644
--- a/r/src/array.h
+++ b/r/src/array.h
@@ -26,9 +26,18 @@
void finalize_array_xptr(SEXP array_xptr);
void finalize_exported_array(struct ArrowArray* array);
+// Returns an external pointer to an array child with a schema attached.
+// The returned pointer will keep its parent alive unless passed through
+// array_xptr_ensure_independent(). This is typically what you want when
+// printing or performing a conversion, where the borrowed external pointer
+// is ephemeral.
+SEXP borrow_array_child_xptr(SEXP array_xptr, int64_t i);
+
+// Returns the underlying struct ArrowArray* from an external pointer,
+// checking and erroring for invalid objects, pointers, and arrays.
static inline struct ArrowArray* array_from_xptr(SEXP array_xptr) {
if (!Rf_inherits(array_xptr, "nanoarrow_array")) {
- Rf_error("`array` argument that is not");
+ Rf_error("`array` argument that is not a nanoarrow_array()");
}
struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr);
@@ -43,6 +52,9 @@ static inline struct ArrowArray* array_from_xptr(SEXP
array_xptr) {
return array;
}
+// Returns the underlying struct ArrowArray* from an external pointer,
+// checking and erroring for invalid objects, pointers, and arrays, but
+// allowing for R_NilValue to signify a NULL return.
static inline struct ArrowArray* nullable_array_from_xptr(SEXP array_xptr) {
if (array_xptr == R_NilValue) {
return NULL;
@@ -51,6 +63,8 @@ static inline struct ArrowArray*
nullable_array_from_xptr(SEXP array_xptr) {
}
}
+// Create an external pointer with the proper class and that will release any
+// non-null, non-released pointer when garbage collected.
static inline SEXP array_owning_xptr() {
struct ArrowArray* array = (struct ArrowArray*)ArrowMalloc(sizeof(struct
ArrowArray));
array->release = NULL;
@@ -62,10 +76,14 @@ static inline SEXP array_owning_xptr() {
return array_xptr;
}
+// Attaches a schema to an array external pointer. The nanoarrow R package
+// attempts to do this whenever possible to avoid misinterpreting arrays.
static inline void array_xptr_set_schema(SEXP array_xptr, SEXP schema_xptr) {
R_SetExternalPtrTag(array_xptr, schema_xptr);
}
+// Retrieves a schema from an array external pointer if it exists or returns
+// NULL otherwise.
static inline struct ArrowSchema* schema_from_array_xptr(SEXP array_xptr) {
SEXP maybe_schema_xptr = R_ExternalPtrTag(array_xptr);
if (Rf_inherits(maybe_schema_xptr, "nanoarrow_schema")) {
@@ -75,16 +93,68 @@ static inline struct ArrowSchema*
schema_from_array_xptr(SEXP array_xptr) {
}
}
-static inline void array_export(SEXP array_xptr, struct ArrowArray*
array_copy) {
- struct ArrowArray* array = array_from_xptr(array_xptr);
+static inline SEXP array_xptr_ensure_independent(SEXP array_xptr);
- // keep all the pointers but use the R_PreserveObject mechanism to keep
+// Exports a version of the array pointed to by array_xptr to array_copy
+// such that (1) any R references to array_xptr are not invalidated if they
exist
+// and (2) array_copy->release() can be called independently without
invalidating
+// R references to array_xptr.
+static inline void array_export(SEXP array_xptr, struct ArrowArray*
array_copy) {
+ // If array_xptr has SEXP dependencies, this will ensure an independent
version
+ // It is possible that this should be done recursively, too, to ensure that
unused
+ // child arrays can be released by wherever this is being exported. This is
in the
+ // specification although it is unclear whether any implementation actually
does this.
+ SEXP independent_array_xptr =
PROTECT(array_xptr_ensure_independent(array_xptr));
+ struct ArrowArray* array = array_from_xptr(independent_array_xptr);
+
+ // Keep all the pointers but use the R_PreserveObject mechanism to keep
// the original data valid (R_ReleaseObject is called from the release
callback)
memcpy(array_copy, array, sizeof(struct ArrowArray));
- array_copy->private_data = array_xptr;
+ array_copy->private_data = independent_array_xptr;
array_copy->release = &finalize_exported_array;
- R_PreserveObject(array_xptr);
+ R_PreserveObject(independent_array_xptr);
+ UNPROTECT(1);
}
+// When arrays arrive as a nanoarrow_array, they are responsible for
+// releasing their children. This is fine until we need to keep one
+// child alive (e.g., a column of a data frame that we attach to an
+// ALTREP array) or until we need to export it (i.e., comply with
+// https://arrow.apache.org/docs/format/CDataInterface.html#moving-child-arrays
+// where child arrays must be movable). To make this work we need to do a
shuffle: we
+// move the child array to a new owning external pointer and
+// give an exported version back to the original object. This only
+// applies if the array_xptr has the external pointer 'prot' field
+// set (if it doesn't have that set, it is already independent).
+static inline SEXP array_ensure_independent(struct ArrowArray* array) {
+ SEXP original_array_xptr = PROTECT(array_owning_xptr());
+
+ // Move array to the newly created owner
+ struct ArrowArray* original_array =
+ (struct ArrowArray*)R_ExternalPtrAddr(original_array_xptr);
+ memcpy(original_array, array, sizeof(struct ArrowArray));
+ array->release = NULL;
+
+ // Export the independent array (which keeps a reference to
original_array_xptr)
+ // back to the original home
+ array_export(original_array_xptr, array);
+ UNPROTECT(1);
+
+ // Return the external pointer of the independent array
+ return original_array_xptr;
+}
+
+// This version is like the version that operates on a raw struct ArrowArray*
+// except it checks if this array has any array dependencies by inspecing the
'Protected'
+// field of the external pointer: if it that field is R_NilValue, it is already
+// independent.
+static inline SEXP array_xptr_ensure_independent(SEXP array_xptr) {
+ struct ArrowArray* array = array_from_xptr(array_xptr);
+ if (R_ExternalPtrProtected(array_xptr) == R_NilValue) {
+ return array_xptr;
+ }
+
+ return array_ensure_independent(array);
+}
#endif
diff --git a/r/src/array_convert_vector.c b/r/src/array_convert_vector.c
new file mode 100644
index 0000000..e4b30cc
--- /dev/null
+++ b/r/src/array_convert_vector.c
@@ -0,0 +1,359 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+#include "altrep.h"
+#include "array.h"
+#include "array_view.h"
+#include "materialize.h"
+
+// These are the vector types that have some special casing
+// internally to avoid unnecessary allocations or looping at
+// the R level. Other types are represented by an SEXP ptype.
+enum VectorType {
+ VECTOR_TYPE_LGL,
+ VECTOR_TYPE_INT,
+ VECTOR_TYPE_DBL,
+ VECTOR_TYPE_CHR,
+ VECTOR_TYPE_LIST_OF_RAW,
+ VECTOR_TYPE_DATA_FRAME,
+ VECTOR_TYPE_OTHER
+};
+
+// These conversions are the default R-native type guesses for
+// an array that don't require extra information from the ptype (e.g.,
+// factor with levels). Some of these guesses may result in a conversion
+// that later warns for out-of-range values (e.g., int64 to double());
+// however, a user can use the from_nanoarrow_array(x, ptype =
something_safer())
+// when this occurs.
+static enum VectorType vector_type_from_array_type(enum ArrowType type) {
+ switch (type) {
+ case NANOARROW_TYPE_BOOL:
+ return VECTOR_TYPE_LGL;
+
+ case NANOARROW_TYPE_INT8:
+ case NANOARROW_TYPE_UINT8:
+ case NANOARROW_TYPE_INT16:
+ case NANOARROW_TYPE_UINT16:
+ case NANOARROW_TYPE_INT32:
+ return VECTOR_TYPE_INT;
+
+ case NANOARROW_TYPE_UINT32:
+ case NANOARROW_TYPE_INT64:
+ case NANOARROW_TYPE_UINT64:
+ case NANOARROW_TYPE_FLOAT:
+ case NANOARROW_TYPE_DOUBLE:
+ return VECTOR_TYPE_DBL;
+
+ case NANOARROW_TYPE_STRING:
+ case NANOARROW_TYPE_LARGE_STRING:
+ return VECTOR_TYPE_CHR;
+
+ case NANOARROW_TYPE_BINARY:
+ case NANOARROW_TYPE_LARGE_BINARY:
+ return VECTOR_TYPE_LIST_OF_RAW;
+
+ case NANOARROW_TYPE_STRUCT:
+ return VECTOR_TYPE_DATA_FRAME;
+
+ default:
+ return VECTOR_TYPE_OTHER;
+ }
+}
+
+// The same as the above, but from a nanoarrow_array()
+static enum VectorType vector_type_from_array_xptr(SEXP array_xptr) {
+ struct ArrowSchema* schema = schema_from_array_xptr(array_xptr);
+
+ struct ArrowSchemaView schema_view;
+ struct ArrowError error;
+ if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) {
+ Rf_error("vector_type_from_array_view_xptr(): %s",
ArrowErrorMessage(&error));
+ }
+
+ return vector_type_from_array_type(schema_view.data_type);
+}
+
+// Call stop_cant_infer_ptype(), which gives a more informative error
+// message than we can provide in a reasonable amount of C code here
+static void call_stop_cant_infer_ptype(SEXP array_xptr) {
+ SEXP ns = PROTECT(R_FindNamespace(Rf_mkString("nanoarrow")));
+ SEXP call = PROTECT(Rf_lang2(Rf_install("stop_cant_infer_ptype"),
array_xptr));
+ Rf_eval(call, ns);
+ UNPROTECT(2);
+}
+
+SEXP nanoarrow_c_infer_ptype(SEXP array_xptr);
+
+static SEXP infer_ptype_data_frame(SEXP array_xptr) {
+ struct ArrowArray* array = array_from_xptr(array_xptr);
+ SEXP result = PROTECT(Rf_allocVector(VECSXP, array->n_children));
+ SEXP result_names = PROTECT(Rf_allocVector(STRSXP, array->n_children));
+
+ for (R_xlen_t i = 0; i < array->n_children; i++) {
+ SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
+ SET_VECTOR_ELT(result, i, nanoarrow_c_infer_ptype(child_xptr));
+ UNPROTECT(1);
+
+ struct ArrowSchema* schema = schema_from_array_xptr(child_xptr);
+ if (schema->name != NULL) {
+ SET_STRING_ELT(result_names, i, Rf_mkCharCE(schema->name, CE_UTF8));
+ } else {
+ SET_STRING_ELT(result_names, i, Rf_mkChar(""));
+ }
+ }
+
+ Rf_setAttrib(result, R_ClassSymbol, Rf_mkString("data.frame"));
+ Rf_setAttrib(result, R_NamesSymbol, result_names);
+ SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2));
+ INTEGER(rownames)[0] = NA_INTEGER;
+ INTEGER(rownames)[1] = 0;
+ Rf_setAttrib(result, R_RowNamesSymbol, rownames);
+ UNPROTECT(3);
+ return result;
+}
+
+SEXP nanoarrow_c_infer_ptype(SEXP array_xptr) {
+ enum VectorType vector_type = vector_type_from_array_xptr(array_xptr);
+
+ switch (vector_type) {
+ case VECTOR_TYPE_LGL:
+ return Rf_allocVector(LGLSXP, 0);
+ case VECTOR_TYPE_INT:
+ return Rf_allocVector(INTSXP, 0);
+ case VECTOR_TYPE_DBL:
+ return Rf_allocVector(REALSXP, 0);
+ case VECTOR_TYPE_CHR:
+ return Rf_allocVector(STRSXP, 0);
+ case VECTOR_TYPE_DATA_FRAME:
+ return infer_ptype_data_frame(array_xptr);
+ default:
+ call_stop_cant_infer_ptype(array_xptr);
+ }
+
+ return R_NilValue;
+}
+
+// This calls from_nanoarrow_array() (via a package helper) to try S3
+// dispatch to find a from_nanoarrow_array() method (or error if there
+// isn't one)
+static SEXP call_from_nanoarrow_array(SEXP array_xptr, SEXP ptype_sexp) {
+ SEXP ns = PROTECT(R_FindNamespace(Rf_mkString("nanoarrow")));
+ SEXP call = PROTECT(
+ Rf_lang3(Rf_install("from_nanoarrow_array_from_c"), array_xptr,
ptype_sexp));
+ SEXP result = PROTECT(Rf_eval(call, ns));
+ UNPROTECT(3);
+ return result;
+}
+
+// Call stop_cant_convert_array(), which gives a more informative error
+// message than we can provide in a reasonable amount of C code here
+static void call_stop_cant_convert_array(SEXP array_xptr, int sexp_type) {
+ SEXP ns = PROTECT(R_FindNamespace(Rf_mkString("nanoarrow")));
+ SEXP ptype_sexp = PROTECT(Rf_allocVector(sexp_type, 0));
+ SEXP call =
+ PROTECT(Rf_lang3(Rf_install("stop_cant_convert_array"), array_xptr,
ptype_sexp));
+ Rf_eval(call, ns);
+ UNPROTECT(3);
+}
+
+SEXP nanoarrow_c_from_array(SEXP array_xptr, SEXP ptype_sexp);
+
+static SEXP from_array_to_data_frame(SEXP array_xptr, SEXP ptype_sexp) {
+ struct ArrowArray* array = array_from_xptr(array_xptr);
+ R_xlen_t n_col = array->n_children;
+ SEXP result = PROTECT(Rf_allocVector(VECSXP, n_col));
+
+ if (ptype_sexp == R_NilValue) {
+ SEXP result_names = PROTECT(Rf_allocVector(STRSXP, n_col));
+
+ for (R_xlen_t i = 0; i < n_col; i++) {
+ SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
+ SET_VECTOR_ELT(result, i, nanoarrow_c_from_array(child_xptr,
R_NilValue));
+ UNPROTECT(1);
+
+ struct ArrowSchema* schema = schema_from_array_xptr(child_xptr);
+ if (schema->name != NULL) {
+ SET_STRING_ELT(result_names, i, Rf_mkCharCE(schema->name, CE_UTF8));
+ } else {
+ SET_STRING_ELT(result_names, i, Rf_mkChar(""));
+ }
+ }
+
+ Rf_setAttrib(result, R_NamesSymbol, result_names);
+ UNPROTECT(1);
+ } else {
+ if (n_col != Rf_xlength(ptype_sexp)) {
+ Rf_error("Expected data.frame() ptype with %ld column(s) but found %ld
column(s)",
+ (long)n_col, (long)Rf_xlength(ptype_sexp));
+ }
+
+ for (R_xlen_t i = 0; i < n_col; i++) {
+ SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
+ SEXP child_ptype = VECTOR_ELT(ptype_sexp, i);
+ SET_VECTOR_ELT(result, i, nanoarrow_c_from_array(child_xptr,
child_ptype));
+ UNPROTECT(1);
+ }
+
+ Rf_setAttrib(result, R_NamesSymbol, Rf_getAttrib(ptype_sexp,
R_NamesSymbol));
+ }
+
+ Rf_setAttrib(result, R_ClassSymbol, Rf_mkString("data.frame"));
+ SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2));
+ INTEGER(rownames)[0] = NA_INTEGER;
+ INTEGER(rownames)[1] = array->length;
+ Rf_setAttrib(result, R_RowNamesSymbol, rownames);
+
+ UNPROTECT(2);
+ return result;
+}
+
+static SEXP from_array_to_lgl(SEXP array_xptr) {
+ SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
+ SEXP result =
PROTECT(nanoarrow_materialize_lgl(array_view_from_xptr(array_view_xptr)));
+ if (result == R_NilValue) {
+ call_stop_cant_convert_array(array_xptr, LGLSXP);
+ }
+ UNPROTECT(2);
+ return result;
+}
+
+static SEXP from_array_to_int(SEXP array_xptr) {
+ SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
+ SEXP result =
PROTECT(nanoarrow_materialize_int(array_view_from_xptr(array_view_xptr)));
+ if (result == R_NilValue) {
+ call_stop_cant_convert_array(array_xptr, INTSXP);
+ }
+ UNPROTECT(2);
+ return result;
+}
+
+static SEXP from_array_to_dbl(SEXP array_xptr) {
+ SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
+ SEXP result =
PROTECT(nanoarrow_materialize_dbl(array_view_from_xptr(array_view_xptr)));
+ if (result == R_NilValue) {
+ call_stop_cant_convert_array(array_xptr, REALSXP);
+ }
+ UNPROTECT(2);
+ return result;
+}
+
+static SEXP from_array_to_chr(SEXP array_xptr) {
+ SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
+ SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_view_xptr));
+ if (result == R_NilValue) {
+ call_stop_cant_convert_array(array_xptr, STRSXP);
+ }
+ UNPROTECT(2);
+ return result;
+}
+
+static SEXP from_array_to_list_of_raw(SEXP array_xptr) {
+ SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
+ SEXP result =
+
PROTECT(nanoarrow_materialize_list_of_raw(array_view_from_xptr(array_view_xptr)));
+ if (result == R_NilValue) {
+ call_stop_cant_convert_array(array_xptr, STRSXP);
+ }
+ UNPROTECT(2);
+ return result;
+}
+
+// TODO: Lists are not all that well supported yet.
+static SEXP from_array_to_list(SEXP array_xptr, SEXP ptype_sexp) {
+ struct ArrowSchema* schema = schema_from_array_xptr(array_xptr);
+
+ struct ArrowSchemaView schema_view;
+ struct ArrowError error;
+ if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) {
+ Rf_error("from_array_to_list(): %s", ArrowErrorMessage(&error));
+ }
+
+ SEXP result = R_NilValue;
+ switch (schema_view.data_type) {
+ case NANOARROW_TYPE_BINARY:
+ case NANOARROW_TYPE_LARGE_BINARY:
+ result = PROTECT(from_array_to_list_of_raw(array_xptr));
+ break;
+ default:
+ call_stop_cant_convert_array(array_xptr, STRSXP);
+ }
+
+ UNPROTECT(1);
+ return result;
+}
+
+SEXP nanoarrow_c_from_array(SEXP array_xptr, SEXP ptype_sexp) {
+ // See if we can skip any ptype resolution at all
+ if (ptype_sexp == R_NilValue) {
+ enum VectorType vector_type = vector_type_from_array_xptr(array_xptr);
+ switch (vector_type) {
+ case VECTOR_TYPE_LGL:
+ return from_array_to_lgl(array_xptr);
+ case VECTOR_TYPE_INT:
+ return from_array_to_int(array_xptr);
+ case VECTOR_TYPE_DBL:
+ return from_array_to_dbl(array_xptr);
+ case VECTOR_TYPE_CHR:
+ return from_array_to_chr(array_xptr);
+ case VECTOR_TYPE_LIST_OF_RAW:
+ return from_array_to_list_of_raw(array_xptr);
+ case VECTOR_TYPE_DATA_FRAME:
+ return from_array_to_data_frame(array_xptr, R_NilValue);
+ default:
+ break;
+ }
+
+ // Otherwise, resolve the ptype and use it (this will also error
+ // for ptypes that can't be resolved)
+ ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(array_xptr));
+ SEXP result = nanoarrow_c_from_array(array_xptr, ptype_sexp);
+ UNPROTECT(1);
+ return result;
+ }
+
+ // Handle some S3 objects internally to avoid S3 dispatch
+ // (e.g., when looping over a data frame with a lot of columns)
+ if (Rf_isObject(ptype_sexp)) {
+ if (Rf_inherits(ptype_sexp, "data.frame") && !Rf_inherits(ptype_sexp,
"tbl_df")) {
+ return from_array_to_data_frame(array_xptr, ptype_sexp);
+ } else {
+ return call_from_nanoarrow_array(array_xptr, ptype_sexp);
+ }
+ }
+
+ // If we're here, these are non-S3 objects
+ switch (TYPEOF(ptype_sexp)) {
+ case LGLSXP:
+ return from_array_to_lgl(array_xptr);
+ case INTSXP:
+ return from_array_to_int(array_xptr);
+ case REALSXP:
+ return from_array_to_dbl(array_xptr);
+ case STRSXP:
+ return from_array_to_chr(array_xptr);
+ case VECSXP:
+ return from_array_to_list(array_xptr, ptype_sexp);
+ default:
+ return call_from_nanoarrow_array(array_xptr, ptype_sexp);
+ }
+}
diff --git a/r/src/array_stream.h b/r/src/array_stream.h
index ecea6cb..d8bf71a 100644
--- a/r/src/array_stream.h
+++ b/r/src/array_stream.h
@@ -25,9 +25,11 @@
void finalize_array_stream_xptr(SEXP array_stream_xptr);
+// Returns the underlying struct ArrowSchema* from an external pointer,
+// checking and erroring for invalid objects, pointers, and arrays.
static inline struct ArrowArrayStream* array_stream_from_xptr(SEXP
array_stream_xptr) {
if (!Rf_inherits(array_stream_xptr, "nanoarrow_array_stream")) {
- Rf_error("`array_stream` argument that is not");
+ Rf_error("`array_stream` argument that is not a nanoarrow_array_stream()");
}
struct ArrowArrayStream* array_stream =
@@ -43,15 +45,8 @@ static inline struct ArrowArrayStream*
array_stream_from_xptr(SEXP array_stream_
return array_stream;
}
-static inline struct ArrowArrayStream* nullable_array_stream_from_xptr(
- SEXP array_stream_xptr) {
- if (array_stream_xptr == R_NilValue) {
- return NULL;
- } else {
- return array_stream_from_xptr(array_stream_xptr);
- }
-}
-
+// Create an external pointer with the proper class and that will release any
+// non-null, non-released pointer when garbage collected.
static inline SEXP array_stream_owning_xptr() {
struct ArrowArrayStream* array_stream =
(struct ArrowArrayStream*)ArrowMalloc(sizeof(struct ArrowArrayStream));
diff --git a/r/src/array_view.c b/r/src/array_view.c
new file mode 100644
index 0000000..2d536aa
--- /dev/null
+++ b/r/src/array_view.c
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+#include "array.h"
+#include "schema.h"
+
+static void finalize_array_view_xptr(SEXP array_view_xptr) {
+ struct ArrowArrayView* array_view =
+ (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
+ if (array_view != NULL) {
+ ArrowArrayViewReset(array_view);
+ ArrowFree(array_view);
+ }
+}
+
+SEXP nanoarrow_c_array_view(SEXP array_xptr, SEXP schema_xptr) {
+ struct ArrowArray* array = array_from_xptr(array_xptr);
+ struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
+
+ struct ArrowError error;
+ ArrowErrorSet(&error, "");
+
+ struct ArrowArrayView* array_view =
+ (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView));
+ ArrowArrayViewInit(array_view, NANOARROW_TYPE_UNINITIALIZED);
+ SEXP xptr = PROTECT(R_MakeExternalPtr(array_view, R_NilValue, array_xptr));
+ R_RegisterCFinalizer(xptr, &finalize_array_view_xptr);
+
+ int result = ArrowArrayViewInitFromSchema(array_view, schema, &error);
+ if (result != NANOARROW_OK) {
+ Rf_error("<ArrowArrayViewInitFromSchema> %s", error.message);
+ }
+
+ result = ArrowArrayViewSetArray(array_view, array, &error);
+ if (result != NANOARROW_OK) {
+ Rf_error("<ArrowArrayViewSetArray> %s", error.message);
+ }
+
+ Rf_setAttrib(xptr, R_ClassSymbol, Rf_mkString("nanoarrow_array_view"));
+ UNPROTECT(1);
+ return xptr;
+}
+
+SEXP array_view_xptr_from_array_xptr(SEXP array_xptr) {
+ return nanoarrow_c_array_view(array_xptr, R_ExternalPtrTag(array_xptr));
+}
diff --git a/r/src/array_view.h b/r/src/array_view.h
new file mode 100644
index 0000000..a2ab79c
--- /dev/null
+++ b/r/src/array_view.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_NANOARROW_ARRAY_VIEW_H_INCLUDED
+#define R_NANOARROW_ARRAY_VIEW_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+// Creates an external pointer to a struct ArrowArrayView, erroring
+// if the validation inherent in its creation fails (i.e., calling
+// this will aslo validate the array). This requires that array_xptr
+// has a schema attached. The ArrowArrayView is an augmented structure
+// provided by the nanoarrow C library that makes it easier to access
+// elements and buffers. This is not currently exposed at the R
+// level but is used at the C level to make validation and conversion
+// to R easier to write.
+SEXP array_view_xptr_from_array_xptr(SEXP array_xptr);
+
+// Returns the struct ArrowArrayView underlying an external pointer,
+// erroring for invalid objects and NULL pointers.
+static inline struct ArrowArrayView* array_view_from_xptr(SEXP
array_view_xptr) {
+ if (!Rf_inherits(array_view_xptr, "nanoarrow_array_view")) {
+ Rf_error("`array_view` argument that is not a nanoarrow_array_view()");
+ }
+
+ struct ArrowArrayView* array_view =
+ (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
+ if (array_view == NULL) {
+ Rf_error("nanoarrow_array_view() is an external pointer to NULL");
+ }
+
+ return array_view;
+}
+
+#endif
diff --git a/r/src/init.c b/r/src/init.c
index 107853d..c48daa0 100644
--- a/r/src/init.c
+++ b/r/src/init.c
@@ -19,12 +19,20 @@
#include <R.h>
#include <Rinternals.h>
+#include "altrep.h"
+
/* generated by tools/make-callentries.R */
+extern SEXP nanoarrow_c_make_altrep_chr(SEXP array_view_xptr);
+extern SEXP nanoarrow_c_is_altrep(SEXP x_sexp);
+extern SEXP nanoarrow_c_altrep_is_materialized(SEXP x_sexp);
+extern SEXP nanoarrow_c_altrep_force_materialize(SEXP x_sexp, SEXP
recursive_sexp);
+extern SEXP nanoarrow_c_infer_ptype(SEXP array_xptr);
+extern SEXP nanoarrow_c_from_array(SEXP array_xptr, SEXP ptype_sexp);
extern SEXP nanoarrow_c_array_stream_get_schema(SEXP array_stream_xptr);
extern SEXP nanoarrow_c_array_stream_get_next(SEXP array_stream_xptr);
+extern SEXP nanoarrow_c_array_view(SEXP array_xptr, SEXP schema_xptr);
extern SEXP nanoarrow_c_array_set_schema(SEXP array_xptr, SEXP schema_xptr,
SEXP validate_sexp);
extern SEXP nanoarrow_c_infer_schema_array(SEXP array_xptr);
-extern SEXP nanoarrow_c_array_view(SEXP array_xptr, SEXP schema_xptr);
extern SEXP nanoarrow_c_array_proxy(SEXP array_xptr, SEXP array_view_xptr,
SEXP recursive_sexp);
extern SEXP nanoarrow_c_buffer_info(SEXP buffer_xptr);
extern SEXP nanoarrow_c_buffer_as_raw(SEXP buffer_xptr);
@@ -45,11 +53,17 @@ extern SEXP nanoarrow_c_export_array(SEXP array_xptr, SEXP
ptr_dst);
extern SEXP nanoarrow_c_schema_to_list(SEXP schema_xptr);
static const R_CallMethodDef CallEntries[] = {
+ {"nanoarrow_c_make_altrep_chr", (DL_FUNC)&nanoarrow_c_make_altrep_chr, 1},
+ {"nanoarrow_c_is_altrep", (DL_FUNC)&nanoarrow_c_is_altrep, 1},
+ {"nanoarrow_c_altrep_is_materialized",
(DL_FUNC)&nanoarrow_c_altrep_is_materialized, 1},
+ {"nanoarrow_c_altrep_force_materialize",
(DL_FUNC)&nanoarrow_c_altrep_force_materialize, 2},
+ {"nanoarrow_c_infer_ptype", (DL_FUNC)&nanoarrow_c_infer_ptype, 1},
+ {"nanoarrow_c_from_array", (DL_FUNC)&nanoarrow_c_from_array, 2},
{"nanoarrow_c_array_stream_get_schema",
(DL_FUNC)&nanoarrow_c_array_stream_get_schema, 1},
{"nanoarrow_c_array_stream_get_next",
(DL_FUNC)&nanoarrow_c_array_stream_get_next, 1},
+ {"nanoarrow_c_array_view", (DL_FUNC)&nanoarrow_c_array_view, 2},
{"nanoarrow_c_array_set_schema", (DL_FUNC)&nanoarrow_c_array_set_schema,
3},
{"nanoarrow_c_infer_schema_array",
(DL_FUNC)&nanoarrow_c_infer_schema_array, 1},
- {"nanoarrow_c_array_view", (DL_FUNC)&nanoarrow_c_array_view, 2},
{"nanoarrow_c_array_proxy", (DL_FUNC)&nanoarrow_c_array_proxy, 3},
{"nanoarrow_c_buffer_info", (DL_FUNC)&nanoarrow_c_buffer_info, 1},
{"nanoarrow_c_buffer_as_raw", (DL_FUNC)&nanoarrow_c_buffer_as_raw, 1},
@@ -74,4 +88,6 @@ static const R_CallMethodDef CallEntries[] = {
void R_init_nanoarrow(DllInfo* dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
+
+ register_nanoarrow_altrep(dll);
}
diff --git a/r/src/materialize.c b/r/src/materialize.c
new file mode 100644
index 0000000..bdd7cbc
--- /dev/null
+++ b/r/src/materialize.c
@@ -0,0 +1,278 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+// Note: These conversions are not currently written for safety rather than
+// speed. We could make use of C++ templating to provide faster and/or more
+// readable conversions here with a C entry point.
+
+SEXP nanoarrow_materialize_lgl(struct ArrowArrayView* array_view) {
+ SEXP result_sexp = PROTECT(Rf_allocVector(LGLSXP,
array_view->array->length));
+ int* result = LOGICAL(result_sexp);
+
+ // True for all the types supported here
+ const uint8_t* is_valid = array_view->buffer_views[0].data.as_uint8;
+ const uint8_t* data_buffer = array_view->buffer_views[1].data.as_uint8;
+
+ // Fill the buffer
+ switch (array_view->storage_type) {
+ case NANOARROW_TYPE_BOOL:
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ result[i] = ArrowBitGet(data_buffer, i);
+ }
+
+ // Set any nulls to NA_LOGICAL
+ if (is_valid != NULL && array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (!ArrowBitGet(is_valid, i)) {
+ result[i] = NA_LOGICAL;
+ }
+ }
+ }
+ break;
+ case NANOARROW_TYPE_INT8:
+ case NANOARROW_TYPE_UINT8:
+ case NANOARROW_TYPE_INT16:
+ case NANOARROW_TYPE_UINT16:
+ case NANOARROW_TYPE_INT32:
+ case NANOARROW_TYPE_UINT32:
+ case NANOARROW_TYPE_INT64:
+ case NANOARROW_TYPE_UINT64:
+ case NANOARROW_TYPE_FLOAT:
+ case NANOARROW_TYPE_DOUBLE:
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ result[i] = ArrowArrayViewGetIntUnsafe(array_view, i) != 0;
+ }
+
+ // Set any nulls to NA_LOGICAL
+ if (is_valid != NULL && array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (!ArrowBitGet(is_valid, i)) {
+ result[i] = NA_LOGICAL;
+ }
+ }
+ }
+ break;
+
+ default:
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ UNPROTECT(1);
+ return result_sexp;
+}
+
+SEXP nanoarrow_materialize_int(struct ArrowArrayView* array_view) {
+ SEXP result_sexp = PROTECT(Rf_allocVector(INTSXP,
array_view->array->length));
+ int* result = INTEGER(result_sexp);
+ int64_t n_bad_values = 0;
+
+ // True for all the types supported here
+ const uint8_t* is_valid = array_view->buffer_views[0].data.as_uint8;
+
+ // Fill the buffer
+ switch (array_view->storage_type) {
+ case NANOARROW_TYPE_INT32:
+ memcpy(result,
+ array_view->buffer_views[1].data.as_int32 +
array_view->array->offset,
+ array_view->array->length * sizeof(int32_t));
+
+ // Set any nulls to NA_INTEGER
+ if (is_valid != NULL && array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (!ArrowBitGet(is_valid, i)) {
+ result[i] = NA_INTEGER;
+ }
+ }
+ }
+ break;
+ case NANOARROW_TYPE_BOOL:
+ case NANOARROW_TYPE_INT8:
+ case NANOARROW_TYPE_UINT8:
+ case NANOARROW_TYPE_INT16:
+ case NANOARROW_TYPE_UINT16:
+ // No need to bounds check for these types
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ result[i] = ArrowArrayViewGetIntUnsafe(array_view, i);
+ }
+
+ // Set any nulls to NA_INTEGER
+ if (is_valid != NULL && array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (!ArrowBitGet(is_valid, i)) {
+ result[i] = NA_INTEGER;
+ }
+ }
+ }
+ break;
+ case NANOARROW_TYPE_UINT32:
+ case NANOARROW_TYPE_INT64:
+ case NANOARROW_TYPE_UINT64:
+ case NANOARROW_TYPE_FLOAT:
+ case NANOARROW_TYPE_DOUBLE:
+ // Loop + bounds check. Because we don't know what memory might be
+ // in a null slot, we have to check nulls if there are any.
+ if (is_valid != NULL && array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (ArrowBitGet(is_valid, i)) {
+ int64_t value = ArrowArrayViewGetIntUnsafe(array_view, i);
+ if (value > INT_MAX || value <= NA_INTEGER) {
+ result[i] = NA_INTEGER;
+ n_bad_values++;
+ } else {
+ result[i] = value;
+ }
+ } else {
+ result[i] = NA_INTEGER;
+ }
+ }
+ } else {
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ int64_t value = ArrowArrayViewGetIntUnsafe(array_view, i);
+ if (value > INT_MAX || value <= NA_INTEGER) {
+ result[i] = NA_INTEGER;
+ n_bad_values++;
+ } else {
+ result[i] = value;
+ }
+ }
+ }
+ break;
+
+ default:
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ if (n_bad_values > 0) {
+ Rf_warning("%ld value(s) outside integer range set to NA",
(long)n_bad_values);
+ }
+
+ UNPROTECT(1);
+ return result_sexp;
+}
+
+SEXP nanoarrow_materialize_dbl(struct ArrowArrayView* array_view) {
+ SEXP result_sexp = PROTECT(Rf_allocVector(REALSXP,
array_view->array->length));
+ double* result = REAL(result_sexp);
+
+ // True for all the types supported here
+ const uint8_t* is_valid = array_view->buffer_views[0].data.as_uint8;
+
+ // Fill the buffer
+ switch (array_view->storage_type) {
+ case NANOARROW_TYPE_DOUBLE:
+ memcpy(result,
+ array_view->buffer_views[1].data.as_double +
array_view->array->offset,
+ array_view->array->length * sizeof(double));
+
+ // Set any nulls to NA_REAL
+ if (is_valid != NULL && array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (!ArrowBitGet(is_valid, i)) {
+ result[i] = NA_REAL;
+ }
+ }
+ }
+ break;
+ case NANOARROW_TYPE_BOOL:
+ case NANOARROW_TYPE_INT8:
+ case NANOARROW_TYPE_UINT8:
+ case NANOARROW_TYPE_INT16:
+ case NANOARROW_TYPE_UINT16:
+ case NANOARROW_TYPE_INT32:
+ case NANOARROW_TYPE_UINT32:
+ case NANOARROW_TYPE_INT64:
+ case NANOARROW_TYPE_UINT64:
+ case NANOARROW_TYPE_FLOAT:
+ // TODO: implement bounds check for int64 and uint64, but instead
+ // of setting to NA, just warn (because sequential values might not
+ // roundtrip above 2^51 ish)
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ result[i] = ArrowArrayViewGetDoubleUnsafe(array_view, i);
+ }
+
+ // Set any nulls to NA_REAL
+ if (is_valid != NULL && array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (!ArrowBitGet(is_valid, i)) {
+ result[i] = NA_REAL;
+ }
+ }
+ }
+ break;
+
+ default:
+ UNPROTECT(1);
+ return R_NilValue;
+ }
+
+ UNPROTECT(1);
+ return result_sexp;
+}
+
+SEXP nanoarrow_materialize_chr(struct ArrowArrayView* array_view) {
+ SEXP result_sexp = PROTECT(Rf_allocVector(STRSXP,
array_view->array->length));
+
+ struct ArrowStringView item;
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (ArrowArrayViewIsNull(array_view, i)) {
+ SET_STRING_ELT(result_sexp, i, NA_STRING);
+ } else {
+ item = ArrowArrayViewGetStringUnsafe(array_view, i);
+ SET_STRING_ELT(result_sexp, i, Rf_mkCharLenCE(item.data, item.n_bytes,
CE_UTF8));
+ }
+ }
+
+ UNPROTECT(1);
+ return result_sexp;
+}
+
+SEXP nanoarrow_materialize_list_of_raw(struct ArrowArrayView* array_view) {
+ switch(array_view->storage_type) {
+ case NANOARROW_TYPE_STRING:
+ case NANOARROW_TYPE_LARGE_STRING:
+ case NANOARROW_TYPE_BINARY:
+ case NANOARROW_TYPE_LARGE_BINARY:
+ break;
+ default:
+ return R_NilValue;
+ }
+
+ SEXP result_sexp = PROTECT(Rf_allocVector(VECSXP,
array_view->array->length));
+
+ struct ArrowBufferView item;
+ SEXP item_sexp;
+ for (R_xlen_t i = 0; i < array_view->array->length; i++) {
+ if (!ArrowArrayViewIsNull(array_view, i)) {
+ item = ArrowArrayViewGetBytesUnsafe(array_view, i);
+ item_sexp = PROTECT(Rf_allocVector(RAWSXP, item.n_bytes));
+ memcpy(RAW(item_sexp), item.data.data, item.n_bytes);
+ SET_VECTOR_ELT(result_sexp, i, item_sexp);
+ UNPROTECT(1);
+ }
+ }
+
+ UNPROTECT(1);
+ return result_sexp;
+}
diff --git a/r/src/materialize.h b/r/src/materialize.h
new file mode 100644
index 0000000..c9aac71
--- /dev/null
+++ b/r/src/materialize.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_H_INCLUDED
+#define R_MATERIALIZE_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+// These functions materialize a complete R vector or return R_NilValue
+// if they cannot (i.e., no conversion possible). These functions will warn
+// (once) if there are values that cannot be converted (e.g., because they
+// are out of range).
+SEXP nanoarrow_materialize_lgl(struct ArrowArrayView* array_view);
+SEXP nanoarrow_materialize_int(struct ArrowArrayView* array_view);
+SEXP nanoarrow_materialize_dbl(struct ArrowArrayView* array_view);
+SEXP nanoarrow_materialize_chr(struct ArrowArrayView* array_view);
+SEXP nanoarrow_materialize_list_of_raw(struct ArrowArrayView* array_view);
+
+#endif
diff --git a/r/src/pointers-cpp.cc b/r/src/pointers_cpp.cc
similarity index 100%
rename from r/src/pointers-cpp.cc
rename to r/src/pointers_cpp.cc
diff --git a/r/src/schema.c b/r/src/schema.c
index 58cf995..d9d4b2f 100644
--- a/r/src/schema.c
+++ b/r/src/schema.c
@@ -68,6 +68,11 @@ static SEXP borrow_schema_xptr(struct ArrowSchema* schema,
SEXP shelter) {
return schema_xptr;
}
+SEXP borrow_schema_child_xptr(SEXP schema_xptr, int64_t i) {
+ struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
+ return borrow_schema_xptr(schema->children[i], schema_xptr);
+}
+
SEXP nanoarrow_c_schema_to_list(SEXP schema_xptr) {
struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
diff --git a/r/src/schema.h b/r/src/schema.h
index f0fc406..9d38ef2 100644
--- a/r/src/schema.h
+++ b/r/src/schema.h
@@ -25,6 +25,13 @@
void finalize_schema_xptr(SEXP schema_xptr);
+// Returns an external pointer to a schema child. The returned pointer will
keep its
+// parent alive: this is typically what you want when printing or performing a
conversion,
+// where the borrowed external pointer is ephemeral.
+SEXP borrow_schema_child_xptr(SEXP schema_xptr, int64_t i);
+
+// Returns the underlying struct ArrowSchema* from an external pointer,
+// checking and erroring for invalid objects, pointers, and arrays.
static inline struct ArrowSchema* schema_from_xptr(SEXP schema_xptr) {
if (!Rf_inherits(schema_xptr, "nanoarrow_schema")) {
Rf_error("`schema` argument that does not inherit from
'nanoarrow_schema'");
@@ -42,6 +49,9 @@ static inline struct ArrowSchema* schema_from_xptr(SEXP
schema_xptr) {
return schema;
}
+// Returns the underlying struct ArrowSchema* from an external pointer,
+// checking and erroring for invalid objects, pointers, and arrays, but
+// allowing for R_NilValue to signify a NULL return.
static inline struct ArrowSchema* nullable_schema_from_xptr(SEXP schema_xptr) {
if (schema_xptr == R_NilValue) {
return NULL;
@@ -50,6 +60,8 @@ static inline struct ArrowSchema*
nullable_schema_from_xptr(SEXP schema_xptr) {
}
}
+// Create an external pointer with the proper class and that will release any
+// non-null, non-released pointer when garbage collected.
static inline SEXP schema_owning_xptr() {
struct ArrowSchema* schema =
(struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema));
diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R
new file mode 100644
index 0000000..f9f9ea9
--- /dev/null
+++ b/r/tests/testthat/test-altrep.R
@@ -0,0 +1,82 @@
+
+test_that("nanoarrow_altrep_chr() returns NULL for unsupported types", {
+ expect_null(nanoarrow_altrep_chr(as_nanoarrow_array(1:10)))
+ expect_null(nanoarrow_altrep_chr(as_nanoarrow_array(1:10)))
+})
+
+test_that("nanoarrow_altrep_chr() works for string", {
+ x <- as_nanoarrow_array(c(NA, letters), schema = arrow::utf8())
+ x_altrep <- nanoarrow_altrep_chr(x)
+
+ expect_output(.Internal(inspect(x_altrep)),
"<nanoarrow::altrep_chr\\[27\\]>")
+
+ # Check that some common operations that call the string elt method
+ # don't materialize the vector
+ expect_identical(x_altrep, c(NA, letters))
+ expect_length(x_altrep, 27)
+ expect_false(is_nanoarrow_altrep_materialized(x_altrep))
+
+ # Setting an element will materialize, duplicate, then modify
+ x_altrep2 <- x_altrep
+ x_altrep2[1] <- "not a letter"
+ expect_identical(x_altrep2, c("not a letter", letters))
+ expect_true(is_nanoarrow_altrep_materialized(x_altrep))
+
+ # Check the same operations on the materialized output
+ expect_identical(x_altrep, c(NA, letters))
+ expect_length(x_altrep, 27)
+
+ # Materialization should get printed in inspect()
+ expect_output(.Internal(inspect(x_altrep)), "<materialized
nanoarrow::altrep_chr\\[27\\]>")
+
+ # For good measure, force materialization again and check
+ nanoarrow_altrep_force_materialize(x_altrep)
+ expect_identical(x_altrep, c(NA, letters))
+ expect_length(x_altrep, 27)
+})
+
+test_that("nanoarrow_altrep_chr() works for large string", {
+ x <- as_nanoarrow_array(letters, schema = arrow::large_utf8())
+ x_altrep <- nanoarrow_altrep_chr(x)
+ expect_identical(x_altrep, letters)
+})
+
+test_that("is_nanoarrow_altrep() returns true for nanoarrow altrep objects", {
+ expect_false(is_nanoarrow_altrep("not altrep"))
+ expect_false(is_nanoarrow_altrep(1:10))
+
expect_true(is_nanoarrow_altrep(nanoarrow_altrep_chr(as_nanoarrow_array("whee"))))
+})
+
+test_that("nanoarrow_altrep_chr_force_materialize() forces materialization", {
+ x <- as_nanoarrow_array(letters, schema = arrow::utf8())
+ x_altrep <- nanoarrow_altrep_chr(x)
+
+ expect_identical(nanoarrow_altrep_force_materialize("not altrep"), 0L)
+ expect_identical(nanoarrow_altrep_force_materialize(x_altrep), 1L)
+
+ x <- as_nanoarrow_array(letters, schema = arrow::utf8())
+ x_altrep_df <- data.frame(x = nanoarrow_altrep_chr(x))
+ expect_identical(
+ nanoarrow_altrep_force_materialize(x_altrep_df, recursive = FALSE),
+ 0L
+ )
+ expect_identical(
+ nanoarrow_altrep_force_materialize(x_altrep_df, recursive = TRUE),
+ 1L
+ )
+ expect_identical(
+ nanoarrow_altrep_force_materialize(x_altrep_df, recursive = TRUE),
+ 0L
+ )
+})
+
+test_that("is_nanoarrow_altrep_materialized() checks for materialization", {
+ expect_identical(is_nanoarrow_altrep_materialized("not altrep"), NA)
+ expect_identical(is_nanoarrow_altrep_materialized(1:10), NA)
+
+ x <- as_nanoarrow_array(letters, schema = arrow::utf8())
+ x_altrep <- nanoarrow_altrep_chr(x)
+ expect_false(is_nanoarrow_altrep_materialized(x_altrep))
+ expect_identical(nanoarrow_altrep_force_materialize(x_altrep), 1L)
+ expect_true(is_nanoarrow_altrep_materialized(x_altrep))
+})
diff --git a/r/tests/testthat/test-array-convert-vector.R
b/r/tests/testthat/test-array-convert-vector.R
new file mode 100644
index 0000000..6e40a03
--- /dev/null
+++ b/r/tests/testthat/test-array-convert-vector.R
@@ -0,0 +1,375 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("infer_nanoarrow_ptype() works for basic types", {
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(logical())),
+ logical()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(integer())),
+ integer()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(double())),
+ double()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(character())),
+ character()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(data.frame(x = character()))),
+ data.frame(x = character())
+ )
+})
+
+test_that("infer_nanoarrow_ptype() errors for types it can't infer", {
+ unsupported_array <- arrow::concat_arrays(type = arrow::decimal256(3, 4))
+ expect_error(
+ infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_array)),
+ "Can't infer R vector type for array <d:3,4,256>"
+ )
+
+ unsupported_struct <- arrow::concat_arrays(
+ type = arrow::struct(col = arrow::decimal256(3, 4))
+ )
+ expect_error(
+ infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_struct)),
+ "Can't infer R vector type for `col` <d:3,4,256>"
+ )
+})
+
+test_that("from_nanoarrow_array() errors for invalid arrays", {
+ array <- as_nanoarrow_array(1:10)
+ nanoarrow_array_set_schema(
+ array,
+ infer_nanoarrow_schema("chr"),
+ validate = FALSE
+ )
+
+ expect_error(
+ from_nanoarrow_array(array),
+ "Expected array with 3 buffer"
+ )
+})
+
+test_that("from_nanoarrow_array() errors for unsupported ptype", {
+ array <- as_nanoarrow_array(1:10)
+
+ # an S3 unsupported type
+ expect_error(
+ from_nanoarrow_array(array, structure(list(), class = "some_class")),
+ "Can't convert array <i> to R vector of type some_class"
+ )
+
+ # A non-S3 unsupported type
+ expect_error(
+ from_nanoarrow_array(array, environment()),
+ "Can't convert array <i> to R vector of type environment"
+ )
+})
+
+test_that("from_nanoarrow_array() errors for unsupported array", {
+ unsupported_array <- arrow::concat_arrays(type = arrow::decimal256(3, 4))
+ expect_error(
+ from_nanoarrow_array(as_nanoarrow_array(unsupported_array)),
+ "Can't infer R vector type for array <d:3,4,256>"
+ )
+})
+
+test_that("convert to vector works for data.frame", {
+ df <- data.frame(a = 1L, b = "two", c = 3, d = TRUE)
+ array <- as_nanoarrow_array(df)
+
+ expect_identical(from_nanoarrow_array(array, NULL), df)
+ expect_identical(from_nanoarrow_array(array, df), df)
+
+ expect_error(
+ from_nanoarrow_array(array, data.frame(a = integer(), b = raw())),
+ "Expected data.frame\\(\\) ptype with 4 column\\(s\\) but found 2
column\\(s\\)"
+ )
+
+ bad_ptype <- data.frame(a = integer(), b = raw(), c = double(), d =
integer())
+ expect_error(
+ from_nanoarrow_array(array, bad_ptype),
+ "Can't convert `b` <u> to R vector of type raw"
+ )
+})
+
+test_that("convert to vector works for partial_frame", {
+ array <- as_nanoarrow_array(data.frame(a = 1L, b = "two"))
+ expect_identical(
+ from_nanoarrow_array(array, vctrs::partial_frame()),
+ data.frame(a = 1L, b = "two")
+ )
+})
+
+test_that("convert to vector works for tibble", {
+ array <- as_nanoarrow_array(data.frame(a = 1L, b = "two"))
+ expect_identical(
+ from_nanoarrow_array(array, tibble::tibble(a = integer(), b =
character())),
+ tibble::tibble(a = 1L, b = "two")
+ )
+
+ # Check nested tibble at both levels
+ tbl_nested_df <- tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
+ array_nested <- as_nanoarrow_array(tbl_nested_df)
+
+ expect_identical(
+ from_nanoarrow_array(array_nested, tbl_nested_df),
+ tbl_nested_df
+ )
+
+ df_nested_tbl <- as.data.frame(tbl_nested_df)
+ df_nested_tbl$c <- tibble::as_tibble(df_nested_tbl$c)
+ expect_identical(
+ from_nanoarrow_array(array_nested, df_nested_tbl),
+ df_nested_tbl
+ )
+})
+
+test_that("convert to vector works for valid logical()", {
+ arrow_numeric_types <- list(
+ int8 = arrow::int8(),
+ uint8 = arrow::uint8(),
+ int16 = arrow::int16(),
+ uint16 = arrow::uint16(),
+ int32 = arrow::int32(),
+ uint32 = arrow::uint32(),
+ int64 = arrow::int64(),
+ uint64 = arrow::uint64(),
+ float32 = arrow::float32(),
+ float64 = arrow::float64()
+ )
+
+ vals <- c(NA, 0:10)
+ for (nm in names(arrow_numeric_types)) {
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
+ logical()
+ ),
+ vals != 0
+ )
+ }
+
+ vals_no_na <- 0:10
+ for (nm in names(arrow_numeric_types)) {
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
+ logical()
+ ),
+ vals_no_na != 0
+ )
+ }
+
+ # Boolean array to logical
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
+ logical()
+ ),
+ c(NA, TRUE, FALSE)
+ )
+
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
+ logical()
+ ),
+ c(TRUE, FALSE)
+ )
+})
+
+test_that("convert to vector errors for bad array to logical()", {
+ expect_error(
+ from_nanoarrow_array(as_nanoarrow_array(letters), logical()),
+ "Can't convert array <u> to R vector of type logical"
+ )
+})
+
+test_that("convert to vector works for valid integer()", {
+ arrow_int_types <- list(
+ int8 = arrow::int8(),
+ uint8 = arrow::uint8(),
+ int16 = arrow::int16(),
+ uint16 = arrow::uint16(),
+ int32 = arrow::int32(),
+ uint32 = arrow::uint32(),
+ int64 = arrow::int64(),
+ uint64 = arrow::uint64(),
+ float32 = arrow::float32(),
+ float64 = arrow::float64()
+ )
+
+ ints <- c(NA, 0:10)
+ for (nm in names(arrow_int_types)) {
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(ints, schema = arrow_int_types[[!!nm]]),
+ integer()
+ ),
+ ints
+ )
+ }
+
+ ints_no_na <- 0:10
+ for (nm in names(arrow_int_types)) {
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(ints_no_na, schema = arrow_int_types[[!!nm]]),
+ integer()
+ ),
+ ints_no_na
+ )
+ }
+
+ # Boolean array to integer
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
+ integer()
+ ),
+ c(NA, 1L, 0L)
+ )
+
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
+ integer()
+ ),
+ c(1L, 0L)
+ )
+})
+
+test_that("convert to vector warns for invalid integer()", {
+ array <- as_nanoarrow_array(arrow::as_arrow_array(.Machine$double.xmax))
+ expect_warning(
+ expect_identical(from_nanoarrow_array(array, integer()), NA_integer_),
+ "1 value\\(s\\) outside integer range set to NA"
+ )
+
+ array <- as_nanoarrow_array(arrow::as_arrow_array(c(NA,
.Machine$double.xmax)))
+ expect_warning(
+ expect_identical(from_nanoarrow_array(array, integer()), c(NA_integer_,
NA_integer_)),
+ "1 value\\(s\\) outside integer range set to NA"
+ )
+})
+
+test_that("convert to vector errors for bad array to integer()", {
+ expect_error(
+ from_nanoarrow_array(as_nanoarrow_array(letters), integer()),
+ "Can't convert array <u> to R vector of type integer"
+ )
+})
+
+test_that("convert to vector works for valid double()", {
+ arrow_numeric_types <- list(
+ int8 = arrow::int8(),
+ uint8 = arrow::uint8(),
+ int16 = arrow::int16(),
+ uint16 = arrow::uint16(),
+ int32 = arrow::int32(),
+ uint32 = arrow::uint32(),
+ int64 = arrow::int64(),
+ uint64 = arrow::uint64(),
+ float32 = arrow::float32(),
+ float64 = arrow::float64()
+ )
+
+ vals <- as.double(c(NA, 0:10))
+ for (nm in names(arrow_numeric_types)) {
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
+ double()
+ ),
+ vals
+ )
+ }
+
+ vals_no_na <- as.double(0:10)
+ for (nm in names(arrow_numeric_types)) {
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
+ double()
+ ),
+ vals_no_na
+ )
+ }
+
+ # Boolean array to double
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
+ double()
+ ),
+ as.double(c(NA, 1L, 0L))
+ )
+
+ expect_identical(
+ from_nanoarrow_array(
+ as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
+ double()
+ ),
+ as.double(c(1L, 0L))
+ )
+})
+
+test_that("convert to vector errors for bad array to double()", {
+ expect_error(
+ from_nanoarrow_array(as_nanoarrow_array(letters), double()),
+ "Can't convert array <u> to R vector of type numeric"
+ )
+})
+
+test_that("convert to vector works for character()", {
+ array <- as_nanoarrow_array(letters)
+ expect_identical(
+ from_nanoarrow_array(array, character()),
+ letters
+ )
+
+ # make sure we get altrep here
+ expect_true(is_nanoarrow_altrep(from_nanoarrow_array(array, character())))
+
+ # check an array that we can't convert
+ expect_error(
+ from_nanoarrow_array(as_nanoarrow_array(1:5), character()),
+ "Can't convert array <i> to R vector of type character"
+ )
+})
+
+test_that("convert to vector works for character()", {
+ array <- as_nanoarrow_array(list(as.raw(1:5)), schema = arrow::binary())
+ expect_identical(
+ from_nanoarrow_array(array),
+ list(as.raw(1:5))
+ )
+
+ expect_identical(
+ from_nanoarrow_array(array, list()),
+ list(as.raw(1:5))
+ )
+})
diff --git a/r/tests/testthat/test-array.R b/r/tests/testthat/test-array.R
index 48aaef0..9039134 100644
--- a/r/tests/testthat/test-array.R
+++ b/r/tests/testthat/test-array.R
@@ -41,7 +41,7 @@ test_that("as_nanoarrow_array() / from_nanoarrow_array()
default method works",
array <- as_nanoarrow_array(1:10)
expect_identical(from_nanoarrow_array(array), 1:10)
- array <- as_nanoarrow_array(1:10, schema = arrow::float64())
+ array <- as_nanoarrow_array(as.double(1:10), schema = arrow::float64())
expect_identical(from_nanoarrow_array(array), as.double(1:10))
})
@@ -69,6 +69,15 @@ test_that("as.vector() and as.data.frame() work for array", {
struct_array <- as_nanoarrow_array(data.frame(a = 1:10))
expect_identical(as.data.frame(struct_array), data.frame(a = 1:10))
+ expect_error(
+ as.data.frame(array),
+ "Can't convert array with schema 'i' to data.frame"
+ )
+})
+
+test_that("as_tibble() works for array()", {
+ struct_array <- as_nanoarrow_array(data.frame(a = 1:10))
+ expect_identical(tibble::as_tibble(struct_array), tibble::tibble(a = 1:10))
})
test_that("schemaless array list interface works for non-nested types", {
diff --git a/r/tests/testthat/test-pkg-arrow.R
b/r/tests/testthat/test-pkg-arrow.R
index 935181e..da7f9fd 100644
--- a/r/tests/testthat/test-pkg-arrow.R
+++ b/r/tests/testthat/test-pkg-arrow.R
@@ -47,6 +47,49 @@ test_that("nanoarrow_array to Array works", {
expect_true(chr$Equals(arrow::Array$create(c("one", "two"))))
})
+test_that("nanoarrow_array to Array works for child arrays", {
+ skip_if_not_installed("arrow")
+
+ df <- data.frame(a = 1, b = "two")
+ batch <- as_nanoarrow_array(df)
+
+ # This type of export is special because batch$children[[2]] has an SEXP
+ # dependency on the original array. When we export it, we reverse that
+ # dependency such that the exported array and the batch->children[1] array
+ # are shells that call R_ReleaseObject on a common object (i.e., sort of like
+ # a shared pointer).
+ array_from_column <- arrow::as_arrow_array(batch$children[[2]])
+
+ # The exported array should be valid
+ expect_null(array_from_column$Validate())
+
+ # All the nanoarrow pointers should still be valid
+ expect_true(nanoarrow_pointer_is_valid(batch))
+ expect_true(nanoarrow_pointer_is_valid(batch$children[[1]]))
+ expect_true(nanoarrow_pointer_is_valid(batch$children[[2]]))
+
+ # Let the exported arrow::Array go out of scope and maximize the
+ # chance that the exported data release callback is called
+ array_from_column <- NULL
+ gc()
+ Sys.sleep(0.1)
+
+ # All the nanoarrow pointers should *still* be valid even after that
+ # release callback is called
+ expect_true(nanoarrow_pointer_is_valid(batch))
+ expect_true(nanoarrow_pointer_is_valid(batch$children[[1]]))
+ expect_true(nanoarrow_pointer_is_valid(batch$children[[2]]))
+
+ # Export one column again but this time let the `batch` go out of scope
+ array_from_column <- arrow::as_arrow_array(batch$children[[1]])
+ batch <- NULL
+ gc()
+ Sys.sleep(0.1)
+
+ # The exported array should still be valid
+ expect_null(array_from_column$Validate())
+})
+
test_that("Array to nanoarrow_array works", {
skip_if_not_installed("arrow")
diff --git a/r/tests/testthat/test-util.R b/r/tests/testthat/test-util.R
index b899d00..e0678ec 100644
--- a/r/tests/testthat/test-util.R
+++ b/r/tests/testthat/test-util.R
@@ -21,3 +21,22 @@ test_that("new_data_frame() works", {
data.frame(x = 1, y = 2)
)
})
+
+test_that("vector fuzzers work", {
+ ptype <- data.frame(a = logical(), b = integer(), c = double(), d =
character())
+ df_gen <- vec_gen(ptype, n = 123)
+
+ expect_identical(nrow(df_gen), 123L)
+ expect_identical(df_gen[integer(), ], ptype)
+
+ expect_error(vec_gen(environment()), "Don't know how to generate vector")
+})
+
+test_that("vector shuffler works", {
+ df <- data.frame(letters = letters)
+ df_shuffled <- vec_shuffle(df)
+ expect_setequal(df_shuffled$letters, df$letters)
+
+ letters_shuffled <- vec_shuffle(letters)
+ expect_setequal(letters_shuffled, letters)
+})
diff --git a/r/tools/make-callentries.R b/r/tools/make-callentries.R
index 89fae66..572f9cf 100644
--- a/r/tools/make-callentries.R
+++ b/r/tools/make-callentries.R
@@ -29,6 +29,7 @@ defs <- tibble(
def = src_sources %>%
str_extract_all(regex("SEXP nanoarrow_c_[^\\)]+\\)", multiline = TRUE)) %>%
unlist() %>%
+ unique() %>%
str_replace_all("\\s+", " ") %>%
str_trim(),
name = def %>% str_extract("nanoarrow_c_[^\\(]+"),