This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new f6104407 fix: Ensure ArrowArrayBuffer() and ArrowArraySetBuffer() work
for variadic buffers (#808)
f6104407 is described below
commit f6104407caf3c169ea76f2d39f0d2b8727995b67
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Sep 29 09:43:53 2025 -0500
fix: Ensure ArrowArrayBuffer() and ArrowArraySetBuffer() work for variadic
buffers (#808)
The high-level goal of this PR is to fix a bug where the R package
couldn't create a shallow copy of a string view array. The R and Python
packages create shallow copies by borrowing buffers from the original
and constructing an array using `ArrowArraySetBuffer()`. Because
`ArrowArraySetBuffer()` errored for any buffer other than 0, 1, and 2,
string view arrays caused cryptic errors when they hit certain places in
the R package.
After this PR, `ArrowArrayBuffer()` and `ArrowArraySetBuffer()` work for
any buffer, variadic or no. This means it's possible to create
string/binary view arrays by buffer, even if it's mostly something we
just do internally.
I'd hoped this would be a more straightforward fix, but the assumption
that there will always be <=3 buffers ran fairly deep.
---
python/src/nanoarrow/_array.pyx | 43 ++++++++------
python/src/nanoarrow/_utils.pyx | 6 ++
python/src/nanoarrow/c_array.py | 4 ++
python/tests/test_c_array.py | 30 ++++++++++
r/R/array.R | 4 ++
r/src/array.c | 25 +++++++-
r/src/array.h | 9 +++
r/src/init.c | 2 +
r/tests/testthat/test-array.R | 40 ++++++++++++-
src/nanoarrow/common/array.c | 115 ++++++++++++++++++++++--------------
src/nanoarrow/common/array_test.cc | 52 ++++++++++++++++
src/nanoarrow/common/inline_array.h | 45 ++++++++++----
src/nanoarrow/common/inline_types.h | 3 -
src/nanoarrow/nanoarrow.h | 6 ++
14 files changed, 303 insertions(+), 81 deletions(-)
diff --git a/python/src/nanoarrow/_array.pyx b/python/src/nanoarrow/_array.pyx
index 869087b5..c01e0c5a 100644
--- a/python/src/nanoarrow/_array.pyx
+++ b/python/src/nanoarrow/_array.pyx
@@ -32,6 +32,7 @@ from cpython cimport (
from nanoarrow_c cimport (
ArrowArray,
+ ArrowArrayAddVariadicBuffers,
ArrowArrayAppendBytes,
ArrowArrayAppendNull,
ArrowArrayAppendString,
@@ -721,6 +722,13 @@ cdef class CArrayBuilder:
self._ptr.null_count = self._ptr.length - count
return self
+ def ensure_buffers(self, n_buffers) -> CArrayBuilder:
+ if self._ptr.n_buffers < n_buffers:
+ code = ArrowArrayAddVariadicBuffers(self._ptr, n_buffers -
self._ptr.n_buffers)
+ Error.raise_error_not_ok("ArrowArrayAddVariadicBuffers() failed",
code)
+
+ return self
+
def set_buffer(self, int64_t i, CBuffer buffer, move=False) ->
CArrayBuilder:
"""Set an ArrowArray buffer
@@ -732,8 +740,8 @@ cdef class CArrayBuilder:
is False (the default), this function will a make a shallow copy via
another
layer of Python object wrapping.
"""
- if i < 0 or i > 3:
- raise IndexError("i must be >= 0 and <= 3")
+ if i < 0 or i >= self._ptr.n_buffers:
+ raise IndexError(f"i must be >= 0 and < {self._ptr.n_buffers}")
if buffer._device != self._device:
raise ValueError(
@@ -749,11 +757,6 @@ cdef class CArrayBuilder:
ArrowBufferMove(buffer._ptr, ArrowArrayBuffer(self._ptr, i))
- # The buffer's lifecycle is now owned by the array; however, we need
- # array->buffers[i] to be updated such that it equals
- # ArrowArrayBuffer(array, i)->data.
- self._ptr.buffers[i] = ArrowArrayBuffer(self._ptr, i).data
-
return self
def set_child(self, int64_t i, CArray c_array, move=False) ->
CArrayBuilder:
@@ -819,20 +822,22 @@ cdef class CArrayBuilder:
cdef Error error = Error()
cdef int code
- if self._can_validate:
- c_validation_level = NANOARROW_VALIDATION_LEVEL_DEFAULT
- if validation_level == "full":
- c_validation_level = NANOARROW_VALIDATION_LEVEL_FULL
- elif validation_level == "minimal":
- c_validation_level = NANOARROW_VALIDATION_LEVEL_MINIMAL
- elif validation_level == "none":
- c_validation_level = NANOARROW_VALIDATION_LEVEL_NONE
+ if not self._can_validate and validation_level not in (None, "none"):
+ raise NotImplementedError("Validation for array with children is
not implemented")
- code = ArrowArrayFinishBuilding(self._ptr, c_validation_level,
&error.c_error)
- error.raise_message_not_ok("ArrowArrayFinishBuildingDefault()",
code)
+ if not self._can_validate:
+ validation_level = "none"
- elif validation_level not in (None, "none"):
- raise NotImplementedError("Validation for array with children is
not implemented")
+ c_validation_level = NANOARROW_VALIDATION_LEVEL_DEFAULT
+ if validation_level == "full":
+ c_validation_level = NANOARROW_VALIDATION_LEVEL_FULL
+ elif validation_level == "minimal":
+ c_validation_level = NANOARROW_VALIDATION_LEVEL_MINIMAL
+ elif validation_level == "none":
+ c_validation_level = NANOARROW_VALIDATION_LEVEL_NONE
+
+ code = ArrowArrayFinishBuilding(self._ptr, c_validation_level,
&error.c_error)
+ error.raise_message_not_ok("ArrowArrayFinishBuildingDefault()", code)
out = self.c_array
self.c_array = CArray.allocate(CSchema.allocate())
diff --git a/python/src/nanoarrow/_utils.pyx b/python/src/nanoarrow/_utils.pyx
index b261fb29..63943b46 100644
--- a/python/src/nanoarrow/_utils.pyx
+++ b/python/src/nanoarrow/_utils.pyx
@@ -32,6 +32,7 @@ from cpython.ref cimport Py_INCREF, Py_DECREF
from nanoarrow_c cimport (
ArrowArray,
+ ArrowArrayAddVariadicBuffers,
ArrowArrayAllocateChildren,
ArrowArrayAllocateDictionary,
ArrowArrayBuffer,
@@ -363,6 +364,11 @@ cdef void c_array_shallow_copy(object base, const
ArrowArray* src, ArrowArray* d
tmp.offset = src.offset
tmp.null_count = src.null_count
+ # We might need some more buffers if we are shallowly copying a
string/binary view
+ if src.n_buffers > 3:
+ code = ArrowArrayAddVariadicBuffers(dst, src.n_buffers - 3)
+ Error.raise_error_not_ok("ArrowArrayAddVariadicBuffers()", code)
+
for i in range(src.n_buffers):
if src.buffers[i] != NULL:
# The purpose of this buffer is soley so that we can use the
diff --git a/python/src/nanoarrow/c_array.py b/python/src/nanoarrow/c_array.py
index 57390b7c..e2131bf4 100644
--- a/python/src/nanoarrow/c_array.py
+++ b/python/src/nanoarrow/c_array.py
@@ -278,6 +278,10 @@ def c_array_from_buffers(
# of children have been initialized.
builder.init_from_schema(schema)
+ # We might need more buffers if we're constructing a string or binary view
+ buffers = list(buffers)
+ builder.ensure_buffers(len(buffers))
+
# Set buffers, optionally moving ownership of the buffers as well (i.e.,
# the objects in the input buffers would be replaced with an empty
ArrowBuffer)
for i, buffer in enumerate(buffers):
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index ae7b80a4..9959bf03 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -501,6 +501,36 @@ def test_c_array_from_buffers_recursive():
)
+def test_c_array_from_buffers_string_view():
+ # Creating the actual view buffer is hard, but we can at least make sure
the
+ # buffers are able to be roundtripped.
+ c_array = na.c_array_from_buffers(
+ na.string_view(),
+ 0,
+ [
+ None,
+ None,
+ b"banana one",
+ b"banana two",
+ b"banana three",
+ na.c_buffer([10, 10, 12], na.int64()),
+ ],
+ validation_level="none",
+ )
+ assert c_array.length == 0
+ assert c_array.null_count == 0
+ assert c_array.offset == 0
+
+ array_view = c_array.view()
+ assert array_view.storage_type == "string_view"
+ assert bytes(array_view.buffer(0)) == b""
+ assert bytes(array_view.buffer(1)) == b""
+ assert bytes(array_view.buffer(2)) == b"banana one"
+ assert bytes(array_view.buffer(3)) == b"banana two"
+ assert bytes(array_view.buffer(4)) == b"banana three"
+ assert list(array_view.buffer(5)) == [10, 10, 12]
+
+
def test_c_array_from_buffers_validation():
# Should fail with all validation levels except none
for validation_level in ("full", "default", "minimal"):
diff --git a/r/R/array.R b/r/R/array.R
index 5347dc9e..2aabaf09 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -332,6 +332,10 @@ nanoarrow_array_modify <- function(array, new_values,
validate = TRUE) {
)
}
+ # Finish building (e.g., ensure pointers are flushed)
+ .Call(nanoarrow_c_array_finish_building, array_copy)
+
+ # Validate if requested
if (!is.null(schema) && validate) {
array_copy <- .Call(nanoarrow_c_array_validate_after_modify, array_copy,
schema)
}
diff --git a/r/src/array.c b/r/src/array.c
index c4603b11..fb4eef21 100644
--- a/r/src/array.c
+++ b/r/src/array.c
@@ -93,8 +93,8 @@ SEXP nanoarrow_c_array_set_buffers(SEXP array_xptr, SEXP
buffers_sexp) {
struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr);
int64_t n_buffers = Rf_xlength(buffers_sexp);
- if (n_buffers > 3) {
- Rf_error("length(array$buffers) must be <= 3");
+ if (n_buffers != array->n_buffers) {
+ Rf_error("Changing the number of buffers in array_modify is not
supported");
}
// Release any buffers that aren't about to be replaced
@@ -248,6 +248,17 @@ static int move_array_buffers(struct ArrowArray* src,
struct ArrowArray* dst,
return NANOARROW_OK;
}
+SEXP nanoarrow_c_array_finish_building(SEXP array_xptr) {
+ struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr);
+ struct ArrowError error;
+ int result = ArrowArrayFinishBuilding(array,
NANOARROW_VALIDATION_LEVEL_NONE, &error);
+ if (result != NANOARROW_OK) {
+ Rf_error("ArrowArrayFinishBuilding(): %s", error.message);
+ }
+
+ return R_NilValue;
+}
+
SEXP nanoarrow_c_array_validate_after_modify(SEXP array_xptr, SEXP
schema_xptr) {
// A very particular type of validation we can do with the ArrowArray we use
// in nanoarrow_array_modify() (which was created using ArrowArrayInit).
@@ -271,6 +282,16 @@ SEXP nanoarrow_c_array_validate_after_modify(SEXP
array_xptr, SEXP schema_xptr)
Rf_error("ArrowArrayInitFromSchema(): %s", error.message);
}
+ // Add any variadic buffers that might be required
+ if ((array->n_buffers > array_dst->n_buffers) &&
+ array->n_buffers > NANOARROW_MAX_FIXED_BUFFERS) {
+ result =
+ ArrowArrayAddVariadicBuffers(array_dst, array->n_buffers -
array_dst->n_buffers);
+ if (result != NANOARROW_OK) {
+ Rf_error("ArrowArrayAddVariadicBuffers() failed");
+ }
+ }
+
result = move_array_buffers(array, array_dst, schema, &error);
if (result != NANOARROW_OK) {
Rf_error("move_array_buffers: %s", error.message);
diff --git a/r/src/array.h b/r/src/array.h
index ebe965eb..96c2849a 100644
--- a/r/src/array.h
+++ b/r/src/array.h
@@ -94,6 +94,15 @@ static inline void array_export(SEXP array_xptr, struct
ArrowArray* array_copy)
array_copy->offset = array->offset;
// Get buffer references, each of which preserve a reference to
independent_array_xptr
+
+ // We might need some more buffers to be allocated for string views
+ if (array->n_buffers > 3) {
+ result = ArrowArrayAddVariadicBuffers(array_copy, array->n_buffers - 3);
+ if (result != NANOARROW_OK) {
+ Rf_error("ArrowArrayAddVariadicBuffers() failed");
+ }
+ }
+
array_copy->n_buffers = array->n_buffers;
for (int64_t i = 0; i < array->n_buffers; i++) {
SEXP borrowed_buffer =
diff --git a/r/src/init.c b/r/src/init.c
index e3288dd4..4d2109a2 100644
--- a/r/src/init.c
+++ b/r/src/init.c
@@ -40,6 +40,7 @@ extern SEXP nanoarrow_c_array_set_offset(SEXP array_xptr,
SEXP offset_sexp);
extern SEXP nanoarrow_c_array_set_buffers(SEXP array_xptr, SEXP buffers_sexp);
extern SEXP nanoarrow_c_array_set_children(SEXP array_xptr, SEXP
children_sexp);
extern SEXP nanoarrow_c_array_set_dictionary(SEXP array_xptr, SEXP
dictionary_xptr);
+extern SEXP nanoarrow_c_array_finish_building(SEXP array_xptr);
extern SEXP nanoarrow_c_array_validate_after_modify(SEXP array_xptr, SEXP
schema_xptr);
extern SEXP nanoarrow_c_array_set_schema(SEXP array_xptr, SEXP schema_xptr,
SEXP validate_sexp);
@@ -121,6 +122,7 @@ static const R_CallMethodDef CallEntries[] = {
{"nanoarrow_c_array_set_buffers", (DL_FUNC)&nanoarrow_c_array_set_buffers,
2},
{"nanoarrow_c_array_set_children",
(DL_FUNC)&nanoarrow_c_array_set_children, 2},
{"nanoarrow_c_array_set_dictionary",
(DL_FUNC)&nanoarrow_c_array_set_dictionary, 2},
+ {"nanoarrow_c_array_finish_building",
(DL_FUNC)&nanoarrow_c_array_finish_building, 1},
{"nanoarrow_c_array_validate_after_modify",
(DL_FUNC)&nanoarrow_c_array_validate_after_modify, 2},
{"nanoarrow_c_array_set_schema", (DL_FUNC)&nanoarrow_c_array_set_schema,
3},
diff --git a/r/tests/testthat/test-array.R b/r/tests/testthat/test-array.R
index ae737aed..e2169d19 100644
--- a/r/tests/testthat/test-array.R
+++ b/r/tests/testthat/test-array.R
@@ -342,16 +342,52 @@ test_that("array modify can modify buffers", {
array <- as_nanoarrow_array(1:5)
expect_error(
nanoarrow_array_modify(array, list(buffers = rep(list(NULL), 4))),
- "must be <= 3"
+ "Changing the number of buffers in array_modify is not supported"
)
# Check that specifying too few buffers will result in a validation error
expect_error(
nanoarrow_array_modify(array, list(buffers = list()), validate = TRUE),
- "Expected 2 buffer"
+ "Changing the number of buffers in array_modify is not supported"
)
})
+test_that("array modify can modify variadic buffers", {
+ # Create a string view array with >1 variadic buffers. The default
+ # internal threshold for splitting up internal buffers is ~30kb.
+ boring_strings <- vapply(
+ 1:10000,
+ function(i) paste0("boring string", i),
+ character(1)
+ )
+
+ array <- as_nanoarrow_array(boring_strings, schema = na_string_view())
+ expect_identical(length(array$buffers), 9L)
+ variadic_sizes <- convert_buffer(array$buffers[[9]])
+ expect_identical(variadic_sizes, c(32757, 32759, 32759, 32759, 32759, 5101))
+ expect_identical(convert_array(array), boring_strings)
+
+ # Save the original array
+ original_array <- array
+
+ # Modify one of the array buffers
+ cool_string_bytes <- charToRaw("cooool string1")
+ first_buffer <- as.raw(array$buffers[[3]])
+ first_buffer[seq_along(cool_string_bytes)] <- cool_string_bytes
+
+ array$buffers[[3]] <- first_buffer
+
+ cool_strings <- boring_strings
+ cool_strings[1] <- "cooool string1"
+ expect_identical(convert_array(array), cool_strings)
+
+ variadic_sizes <- convert_buffer(array$buffers[[9]])
+ expect_identical(variadic_sizes, c(32757, 32759, 32759, 32759, 32759, 5101))
+
+ # Check that the original was unmodified
+ expect_identical(convert_array(original_array), boring_strings)
+})
+
test_that("array modify checks buffer sizes", {
array <- as_nanoarrow_array(1:5)
expect_error(
diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c
index 0ea48c7a..2f0f824c 100644
--- a/src/nanoarrow/common/array.c
+++ b/src/nanoarrow/common/array.c
@@ -38,7 +38,6 @@ static void ArrowArrayReleaseInternal(struct ArrowArray*
array) {
ArrowBufferReset(&private_data->variadic_buffers[i]);
}
ArrowFree(private_data->variadic_buffers);
- ArrowFree(private_data->variadic_buffer_sizes);
ArrowFree(private_data);
}
@@ -74,6 +73,10 @@ static void ArrowArrayReleaseInternal(struct ArrowArray*
array) {
array->release = NULL;
}
+static int ArrowArrayIsInternal(struct ArrowArray* array) {
+ return array->release == &ArrowArrayReleaseInternal;
+}
+
static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
enum ArrowType storage_type) {
switch (storage_type) {
@@ -170,7 +173,6 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray*
array,
}
private_data->n_variadic_buffers = 0;
private_data->variadic_buffers = NULL;
- private_data->variadic_buffer_sizes = NULL;
private_data->list_view_offset = 0;
array->private_data = private_data;
@@ -333,25 +335,35 @@ ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray*
array, int64_t i,
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
- switch (i) {
- case 0:
- ArrowBufferMove(buffer, &private_data->bitmap.buffer);
- private_data->buffer_data[i] = private_data->bitmap.buffer.data;
- break;
- case 1:
- case 2:
- ArrowBufferMove(buffer, &private_data->buffers[i - 1]);
- private_data->buffer_data[i] = private_data->buffers[i - 1].data;
- break;
- default:
- return EINVAL;
+ if (i >= array->n_buffers || i < 0) {
+ return EINVAL;
}
+ // Find the `i`th buffer, release what is currently there, and move the
+ // supplied buffer into that slot.
+ struct ArrowBuffer* dst = ArrowArrayBuffer(array, i);
+ ArrowBufferReset(dst);
+ ArrowBufferMove(buffer, dst);
+
+ // Flush the pointer into array->buffers. In theory clients should call
+ // ArrowArrayFinishBuilding() to flush the pointer values before passing
+ // this array elsewhere; however, in early nanoarrow versions this was not
+ // needed and some code may depend on this being true.
+ private_data->buffer_data[i] = dst->data;
+ array->buffers = private_data->buffer_data;
+
return NANOARROW_OK;
}
static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView*
array_view,
- struct ArrowArray* array) {
+ struct ArrowArray* array,
+ struct ArrowError* error) {
+ if (!ArrowArrayIsInternal(array)) {
+ ArrowErrorSet(error,
+ "Can't initialize internal ArrowArrayView from external
ArrowArray");
+ return EINVAL;
+ }
+
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
@@ -376,7 +388,8 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct
ArrowArrayView* array_v
}
for (int64_t i = 0; i < array->n_children; i++) {
- result = ArrowArrayViewInitFromArray(array_view->children[i],
array->children[i]);
+ result =
+ ArrowArrayViewInitFromArray(array_view->children[i],
array->children[i], error);
if (result != NANOARROW_OK) {
ArrowArrayViewReset(array_view);
return result;
@@ -390,7 +403,8 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct
ArrowArrayView* array_v
return result;
}
- result = ArrowArrayViewInitFromArray(array_view->dictionary,
array->dictionary);
+ result =
+ ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary,
error);
if (result != NANOARROW_OK) {
ArrowArrayViewReset(array_view);
return result;
@@ -403,7 +417,7 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct
ArrowArrayView* array_v
static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array,
struct ArrowArrayView*
array_view) {
// Loop through buffers and reserve the extra space that we know about
- for (int64_t i = 0; i < array->n_buffers; i++) {
+ for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
// Don't reserve on a validity buffer that hasn't been allocated yet
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY &&
ArrowArrayBuffer(array, i)->data == NULL) {
@@ -431,7 +445,7 @@ static ArrowErrorCode ArrowArrayReserveInternal(struct
ArrowArray* array,
ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array,
int64_t additional_size_elements) {
struct ArrowArrayView array_view;
- NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array));
+ NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array,
NULL));
// Calculate theoretical buffer sizes (recursively)
ArrowArrayViewSetLength(&array_view, array->length +
additional_size_elements);
@@ -463,47 +477,62 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct
ArrowArray* array) {
}
for (int64_t i = 0; i < array->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i]));
+ if (ArrowArrayIsInternal(array->children[i])) {
+ NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i]));
+ }
}
- if (array->dictionary != NULL) {
+ if (array->dictionary != NULL && ArrowArrayIsInternal(array->dictionary)) {
NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary));
}
return NANOARROW_OK;
}
-static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
+static ArrowErrorCode ArrowArrayFlushInternalPointers(struct ArrowArray*
array) {
+ NANOARROW_DCHECK(ArrowArrayIsInternal(array));
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
- const bool is_binary_view = private_data->storage_type ==
NANOARROW_TYPE_STRING_VIEW ||
- private_data->storage_type ==
NANOARROW_TYPE_BINARY_VIEW;
- const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS;
+ if (array->n_buffers > NANOARROW_MAX_FIXED_BUFFERS) {
+ // If the variadic sizes buffer was not set and there is at least one
variadic
+ // buffer, populate it now (if there are no variadic buffers there will be
exactly
+ // three total buffers and we don't need to do anything special here).
Notably, this
+ // will occur when building a BinaryView/StringView array by element using
the
+ // appender.
+ struct ArrowBuffer* sizes_buffer = ArrowArrayBuffer(array,
array->n_buffers - 1);
+ if (sizes_buffer->data == NULL && sizes_buffer->size_bytes == 0) {
+ NANOARROW_RETURN_NOT_OK(
+ ArrowBufferReserve(sizes_buffer, private_data->n_variadic_buffers));
+ for (int64_t i = 0; i < private_data->n_variadic_buffers; i++) {
+ struct ArrowBuffer* variadic_buffer =
+ ArrowArrayBuffer(array, i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS);
+ NANOARROW_RETURN_NOT_OK(
+ ArrowBufferAppendInt64(sizes_buffer, variadic_buffer->size_bytes));
+ }
+ }
+ }
- for (int32_t i = 0; i < nfixed_buf; i++) {
+ for (int32_t i = 0; i < array->n_buffers; i++) {
private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
}
- if (is_binary_view) {
- const int32_t nvirt_buf = private_data->n_variadic_buffers;
- private_data->buffer_data = (const void**)ArrowRealloc(
- private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf +
1));
- for (int32_t i = 0; i < nvirt_buf; i++) {
- private_data->buffer_data[nfixed_buf + i] =
private_data->variadic_buffers[i].data;
- }
- private_data->buffer_data[nfixed_buf + nvirt_buf] =
- private_data->variadic_buffer_sizes;
- array->buffers = (const void**)(private_data->buffer_data);
- }
+ array->buffers = (const void**)(private_data->buffer_data);
+ // Flush internal pointers for child/dictionary arrays if we allocated them.
Clients
+ // building arrays by buffer might have moved arrays from some other source
(e.g.,
+ // to create a record batch) and calling this function in that case will
cause a crash.
for (int64_t i = 0; i < array->n_children; i++) {
- ArrowArrayFlushInternalPointers(array->children[i]);
+ if (ArrowArrayIsInternal(array->children[i])) {
+
NANOARROW_RETURN_NOT_OK(ArrowArrayFlushInternalPointers(array->children[i]));
+ }
}
- if (array->dictionary != NULL) {
- ArrowArrayFlushInternalPointers(array->dictionary);
+ if (array->dictionary != NULL && ArrowArrayIsInternal(array->dictionary)) {
+
NANOARROW_RETURN_NOT_OK(ArrowArrayFlushInternalPointers(array->dictionary));
}
+
+ return NANOARROW_OK;
}
ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array,
@@ -519,7 +548,7 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray*
array,
// Make sure the value we get with array->buffers[i] is set to the actual
// pointer (which may have changed from the original due to reallocation)
- ArrowArrayFlushInternalPointers(array);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFlushInternalPointers(array),
error);
if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) {
return NANOARROW_OK;
@@ -527,8 +556,8 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray*
array,
// For validation, initialize an ArrowArrayView with our known buffer sizes
struct ArrowArrayView array_view;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view,
array),
- error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ ArrowArrayViewInitFromArray(&array_view, array, error), error);
int result = ArrowArrayViewValidate(&array_view, validation_level, error);
ArrowArrayViewReset(&array_view);
return result;
diff --git a/src/nanoarrow/common/array_test.cc
b/src/nanoarrow/common/array_test.cc
index a523a360..35a93787 100644
--- a/src/nanoarrow/common/array_test.cc
+++ b/src/nanoarrow/common/array_test.cc
@@ -216,6 +216,58 @@ TEST(ArrayTest, ArrayTestSetBuffer) {
ArrowArrayRelease(&array);
}
+TEST(ArrayTest, ArrayTestStringViewBuffer) {
+ // Build an array of size zero but with a few variadic buffers to ensure our
get/set
+ // buffer logic finds the correct buffers.
+
+ struct ArrowBuffer vbuffer0, vbuffer1, vbuffer2;
+ ArrowBufferInit(&vbuffer0);
+ ASSERT_EQ(ArrowBufferAppendStringView(&vbuffer0, ArrowCharView("banana
one")),
+ NANOARROW_OK);
+ ArrowBufferInit(&vbuffer1);
+ ASSERT_EQ(ArrowBufferAppendStringView(&vbuffer1, ArrowCharView("banana
two")),
+ NANOARROW_OK);
+ ArrowBufferInit(&vbuffer2);
+ ASSERT_EQ(ArrowBufferAppendStringView(&vbuffer2, ArrowCharView("banana
three")),
+ NANOARROW_OK);
+
+ struct ArrowArray array;
+ ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_STRING_VIEW),
NANOARROW_OK);
+ ASSERT_EQ(array.n_buffers, 3);
+
+ // Add variadic buffers
+ ASSERT_EQ(ArrowArrayAddVariadicBuffers(&array, 3), NANOARROW_OK);
+ ASSERT_EQ(array.n_buffers, 6);
+
+ // Set their content
+ ASSERT_EQ(ArrowArraySetBuffer(&array, 2, &vbuffer0), NANOARROW_OK);
+ ASSERT_EQ(ArrowArraySetBuffer(&array, 3, &vbuffer1), NANOARROW_OK);
+ ASSERT_EQ(ArrowArraySetBuffer(&array, 4, &vbuffer2), NANOARROW_OK);
+
+ // Ensure it is reflected in array->buffers
+ EXPECT_EQ(memcmp(array.buffers[2], "banana one", 10), 0);
+ EXPECT_EQ(memcmp(array.buffers[3], "banana two", 10), 0);
+ EXPECT_EQ(memcmp(array.buffers[4], "banana three", 12), 0);
+
+ // Sizes buffer has not been built yet
+ EXPECT_EQ(array.buffers[5], nullptr);
+
+ // ArrowArrayBuffer should find the correct buffers as well
+ EXPECT_EQ(ArrowArrayBuffer(&array, 2)->data, array.buffers[2]);
+ EXPECT_EQ(ArrowArrayBuffer(&array, 3)->data, array.buffers[3]);
+ EXPECT_EQ(ArrowArrayBuffer(&array, 4)->data, array.buffers[4]);
+ EXPECT_EQ(ArrowArrayBuffer(&array, 5)->data, array.buffers[5]);
+
+ // After we finish building, the sizes buffer should have been built for us
+ ASSERT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_NONE,
nullptr),
+ NANOARROW_OK);
+
+ int64_t expected_sizes[] = {10, 10, 12};
+ EXPECT_EQ(memcmp(array.buffers[5], &expected_sizes, sizeof(expected_sizes)),
0);
+
+ ArrowArrayRelease(&array);
+}
+
TEST(ArrayTest, ArrayTestBuildByBuffer) {
// the array ["a", null, "bc", null, "def", null, "ghij"]
uint8_t validity_bitmap[] = {0x55};
diff --git a/src/nanoarrow/common/inline_array.h
b/src/nanoarrow/common/inline_array.h
index 90fa9e6a..885276b4 100644
--- a/src/nanoarrow/common/inline_array.h
+++ b/src/nanoarrow/common/inline_array.h
@@ -43,8 +43,25 @@ static inline struct ArrowBuffer* ArrowArrayBuffer(struct
ArrowArray* array, int
switch (i) {
case 0:
return &private_data->bitmap.buffer;
+ case 1:
+ return private_data->buffers;
default:
- return private_data->buffers + i - 1;
+ if (array->n_buffers > 3 && i == (array->n_buffers - 1)) {
+ // The variadic buffer sizes buffer if for a BinaryView/String view
array
+ // is always stored in private_data->buffers[1]; however, from the
numbered
+ // buffers perspective this is the array->buffers[array->n_buffers -
1].
+ return private_data->buffers + 1;
+ } else if (array->n_buffers > 3) {
+ // If there are one or more variadic buffers, they are stored in
+ // private_data->variadic_buffers
+ return private_data->variadic_buffers + (i - 2);
+ } else {
+ // Otherwise, we're just accessing buffer at index 2 (e.g.,
String/Binary
+ // data buffer or variadic sizes buffer for the case where there are no
+ // variadic buffers)
+ NANOARROW_DCHECK(i == 2);
+ return private_data->buffers + i - 1;
+ }
}
}
@@ -527,9 +544,9 @@ static inline int32_t ArrowArrayVariadicBufferCount(struct
ArrowArray* array) {
}
static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray*
array,
- int32_t nbuffers) {
+ int32_t n_buffers) {
const int32_t n_current_bufs = ArrowArrayVariadicBufferCount(array);
- const int32_t nvariadic_bufs_needed = n_current_bufs + nbuffers;
+ const int32_t nvariadic_bufs_needed = n_current_bufs + n_buffers;
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
@@ -539,19 +556,24 @@ static inline ArrowErrorCode
ArrowArrayAddVariadicBuffers(struct ArrowArray* arr
if (private_data->variadic_buffers == NULL) {
return ENOMEM;
}
- private_data->variadic_buffer_sizes = (int64_t*)ArrowRealloc(
- private_data->variadic_buffer_sizes, sizeof(int64_t) *
nvariadic_bufs_needed);
- if (private_data->variadic_buffer_sizes == NULL) {
- return ENOMEM;
- }
+
+ private_data->n_variadic_buffers = nvariadic_bufs_needed;
+ array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1 +
nvariadic_bufs_needed;
+
+ private_data->buffer_data = (const void**)ArrowRealloc(
+ private_data->buffer_data, array->n_buffers * sizeof(void*));
for (int32_t i = n_current_bufs; i < nvariadic_bufs_needed; i++) {
ArrowBufferInit(&private_data->variadic_buffers[i]);
- private_data->variadic_buffer_sizes[i] = 0;
+ private_data->buffer_data[NANOARROW_BINARY_VIEW_FIXED_BUFFERS + i] = NULL;
}
- private_data->n_variadic_buffers = nvariadic_bufs_needed;
- array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1 +
nvariadic_bufs_needed;
+ // Zero out memory for the final buffer (variadic sizes buffer we haven't
built yet)
+ private_data->buffer_data[NANOARROW_BINARY_VIEW_FIXED_BUFFERS +
nvariadic_bufs_needed] =
+ NULL;
+
+ // Ensure array->buffers points to a valid value
+ array->buffers = private_data->buffer_data;
return NANOARROW_OK;
}
@@ -589,7 +611,6 @@ static inline ArrowErrorCode ArrowArrayAppendBytes(struct
ArrowArray* array,
bvt.ref.offset = (int32_t)variadic_buf->size_bytes;
NANOARROW_RETURN_NOT_OK(
ArrowBufferAppend(variadic_buf, value.data.as_char,
value.size_bytes));
- private_data->variadic_buffer_sizes[buf_index] =
variadic_buf->size_bytes;
}
NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &bvt, sizeof(bvt)));
} else {
diff --git a/src/nanoarrow/common/inline_types.h
b/src/nanoarrow/common/inline_types.h
index a8d194f1..8748c67f 100644
--- a/src/nanoarrow/common/inline_types.h
+++ b/src/nanoarrow/common/inline_types.h
@@ -870,9 +870,6 @@ struct ArrowArrayPrivateData {
// Variadic buffers for binary view types
struct ArrowBuffer* variadic_buffers;
- // Size of each variadic buffer in bytes
- int64_t* variadic_buffer_sizes;
-
// The current offset used to build list views
int64_t list_view_offset;
};
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index a32879a7..aa82d987 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -905,6 +905,12 @@ NANOARROW_DLL void ArrowArraySetValidityBitmap(struct
ArrowArray* array,
NANOARROW_DLL ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array,
int64_t i,
struct ArrowBuffer* buffer);
+/// \brief Add variadic buffers to a string or binary view array
+///
+/// array must have been allocated using ArrowArrayInitFromType()
+static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray*
array,
+ int32_t n_buffers);
+
/// \brief Get the validity bitmap of an ArrowArray
///
/// array must have been allocated using ArrowArrayInitFromType()