This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f6104407 fix: Ensure ArrowArrayBuffer() and ArrowArraySetBuffer() work 
for variadic buffers (#808)
f6104407 is described below

commit f6104407caf3c169ea76f2d39f0d2b8727995b67
Author: Dewey Dunnington <[email protected]>
AuthorDate: Mon Sep 29 09:43:53 2025 -0500

    fix: Ensure ArrowArrayBuffer() and ArrowArraySetBuffer() work for variadic 
buffers (#808)
    
    The high-level goal of this PR is to fix a bug where the R package
    couldn't create a shallow copy of a string view array. The R and Python
    packages create shallow copies by borrowing buffers from the original
    and constructing an array using `ArrowArraySetBuffer()`. Because
    `ArrowArraySetBuffer()` errored for any buffer other than 0, 1, and 2,
    string view arrays caused cryptic errors when they hit certain places in
    the R package.
    
    After this PR, `ArrowArrayBuffer()` and `ArrowArraySetBuffer()` work for
    any buffer, variadic or no. This means it's possible to create
    string/binary view arrays by buffer, even if it's mostly something we
    just do internally.
    
    I'd hoped this would be a more straightforward fix, but the assumption
    that there will always be <=3 buffers ran fairly deep.
---
 python/src/nanoarrow/_array.pyx     |  43 ++++++++------
 python/src/nanoarrow/_utils.pyx     |   6 ++
 python/src/nanoarrow/c_array.py     |   4 ++
 python/tests/test_c_array.py        |  30 ++++++++++
 r/R/array.R                         |   4 ++
 r/src/array.c                       |  25 +++++++-
 r/src/array.h                       |   9 +++
 r/src/init.c                        |   2 +
 r/tests/testthat/test-array.R       |  40 ++++++++++++-
 src/nanoarrow/common/array.c        | 115 ++++++++++++++++++++++--------------
 src/nanoarrow/common/array_test.cc  |  52 ++++++++++++++++
 src/nanoarrow/common/inline_array.h |  45 ++++++++++----
 src/nanoarrow/common/inline_types.h |   3 -
 src/nanoarrow/nanoarrow.h           |   6 ++
 14 files changed, 303 insertions(+), 81 deletions(-)

diff --git a/python/src/nanoarrow/_array.pyx b/python/src/nanoarrow/_array.pyx
index 869087b5..c01e0c5a 100644
--- a/python/src/nanoarrow/_array.pyx
+++ b/python/src/nanoarrow/_array.pyx
@@ -32,6 +32,7 @@ from cpython cimport (
 
 from nanoarrow_c cimport (
     ArrowArray,
+    ArrowArrayAddVariadicBuffers,
     ArrowArrayAppendBytes,
     ArrowArrayAppendNull,
     ArrowArrayAppendString,
@@ -721,6 +722,13 @@ cdef class CArrayBuilder:
         self._ptr.null_count = self._ptr.length - count
         return self
 
+    def ensure_buffers(self, n_buffers) -> CArrayBuilder:
+        if self._ptr.n_buffers < n_buffers:
+            code = ArrowArrayAddVariadicBuffers(self._ptr, n_buffers - 
self._ptr.n_buffers)
+            Error.raise_error_not_ok("ArrowArrayAddVariadicBuffers() failed", 
code)
+
+        return self
+
     def set_buffer(self, int64_t i, CBuffer buffer, move=False) -> 
CArrayBuilder:
         """Set an ArrowArray buffer
 
@@ -732,8 +740,8 @@ cdef class CArrayBuilder:
         is False (the default), this function will a make a shallow copy via 
another
         layer of Python object wrapping.
         """
-        if i < 0 or i > 3:
-            raise IndexError("i must be >= 0 and <= 3")
+        if i < 0 or i >= self._ptr.n_buffers:
+            raise IndexError(f"i must be >= 0 and < {self._ptr.n_buffers}")
 
         if buffer._device != self._device:
             raise ValueError(
@@ -749,11 +757,6 @@ cdef class CArrayBuilder:
 
         ArrowBufferMove(buffer._ptr, ArrowArrayBuffer(self._ptr, i))
 
-        # The buffer's lifecycle is now owned by the array; however, we need
-        # array->buffers[i] to be updated such that it equals
-        # ArrowArrayBuffer(array, i)->data.
-        self._ptr.buffers[i] = ArrowArrayBuffer(self._ptr, i).data
-
         return self
 
     def set_child(self, int64_t i, CArray c_array, move=False) -> 
CArrayBuilder:
@@ -819,20 +822,22 @@ cdef class CArrayBuilder:
         cdef Error error = Error()
         cdef int code
 
-        if self._can_validate:
-            c_validation_level = NANOARROW_VALIDATION_LEVEL_DEFAULT
-            if validation_level == "full":
-                c_validation_level = NANOARROW_VALIDATION_LEVEL_FULL
-            elif validation_level == "minimal":
-                c_validation_level = NANOARROW_VALIDATION_LEVEL_MINIMAL
-            elif validation_level == "none":
-                c_validation_level = NANOARROW_VALIDATION_LEVEL_NONE
+        if not self._can_validate and validation_level not in (None, "none"):
+            raise NotImplementedError("Validation for array with children is 
not implemented")
 
-            code = ArrowArrayFinishBuilding(self._ptr, c_validation_level, 
&error.c_error)
-            error.raise_message_not_ok("ArrowArrayFinishBuildingDefault()", 
code)
+        if not self._can_validate:
+            validation_level = "none"
 
-        elif validation_level not in (None, "none"):
-            raise NotImplementedError("Validation for array with children is 
not implemented")
+        c_validation_level = NANOARROW_VALIDATION_LEVEL_DEFAULT
+        if validation_level == "full":
+            c_validation_level = NANOARROW_VALIDATION_LEVEL_FULL
+        elif validation_level == "minimal":
+            c_validation_level = NANOARROW_VALIDATION_LEVEL_MINIMAL
+        elif validation_level == "none":
+            c_validation_level = NANOARROW_VALIDATION_LEVEL_NONE
+
+        code = ArrowArrayFinishBuilding(self._ptr, c_validation_level, 
&error.c_error)
+        error.raise_message_not_ok("ArrowArrayFinishBuildingDefault()", code)
 
         out = self.c_array
         self.c_array = CArray.allocate(CSchema.allocate())
diff --git a/python/src/nanoarrow/_utils.pyx b/python/src/nanoarrow/_utils.pyx
index b261fb29..63943b46 100644
--- a/python/src/nanoarrow/_utils.pyx
+++ b/python/src/nanoarrow/_utils.pyx
@@ -32,6 +32,7 @@ from cpython.ref cimport Py_INCREF, Py_DECREF
 
 from nanoarrow_c cimport (
     ArrowArray,
+    ArrowArrayAddVariadicBuffers,
     ArrowArrayAllocateChildren,
     ArrowArrayAllocateDictionary,
     ArrowArrayBuffer,
@@ -363,6 +364,11 @@ cdef void c_array_shallow_copy(object base, const 
ArrowArray* src, ArrowArray* d
     tmp.offset = src.offset
     tmp.null_count = src.null_count
 
+    # We might need some more buffers if we are shallowly copying a 
string/binary view
+    if src.n_buffers > 3:
+        code = ArrowArrayAddVariadicBuffers(dst, src.n_buffers - 3)
+        Error.raise_error_not_ok("ArrowArrayAddVariadicBuffers()", code)
+
     for i in range(src.n_buffers):
         if src.buffers[i] != NULL:
             # The purpose of this buffer is soley so that we can use the
diff --git a/python/src/nanoarrow/c_array.py b/python/src/nanoarrow/c_array.py
index 57390b7c..e2131bf4 100644
--- a/python/src/nanoarrow/c_array.py
+++ b/python/src/nanoarrow/c_array.py
@@ -278,6 +278,10 @@ def c_array_from_buffers(
     # of children have been initialized.
     builder.init_from_schema(schema)
 
+    # We might need more buffers if we're constructing a string or binary view
+    buffers = list(buffers)
+    builder.ensure_buffers(len(buffers))
+
     # Set buffers, optionally moving ownership of the buffers as well (i.e.,
     # the objects in the input buffers would be replaced with an empty 
ArrowBuffer)
     for i, buffer in enumerate(buffers):
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index ae7b80a4..9959bf03 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -501,6 +501,36 @@ def test_c_array_from_buffers_recursive():
         )
 
 
+def test_c_array_from_buffers_string_view():
+    # Creating the actual view buffer is hard, but we can at least make sure 
the
+    # buffers are able to be roundtripped.
+    c_array = na.c_array_from_buffers(
+        na.string_view(),
+        0,
+        [
+            None,
+            None,
+            b"banana one",
+            b"banana two",
+            b"banana three",
+            na.c_buffer([10, 10, 12], na.int64()),
+        ],
+        validation_level="none",
+    )
+    assert c_array.length == 0
+    assert c_array.null_count == 0
+    assert c_array.offset == 0
+
+    array_view = c_array.view()
+    assert array_view.storage_type == "string_view"
+    assert bytes(array_view.buffer(0)) == b""
+    assert bytes(array_view.buffer(1)) == b""
+    assert bytes(array_view.buffer(2)) == b"banana one"
+    assert bytes(array_view.buffer(3)) == b"banana two"
+    assert bytes(array_view.buffer(4)) == b"banana three"
+    assert list(array_view.buffer(5)) == [10, 10, 12]
+
+
 def test_c_array_from_buffers_validation():
     # Should fail with all validation levels except none
     for validation_level in ("full", "default", "minimal"):
diff --git a/r/R/array.R b/r/R/array.R
index 5347dc9e..2aabaf09 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -332,6 +332,10 @@ nanoarrow_array_modify <- function(array, new_values, 
validate = TRUE) {
     )
   }
 
+  # Finish building (e.g., ensure pointers are flushed)
+  .Call(nanoarrow_c_array_finish_building, array_copy)
+
+  # Validate if requested
   if (!is.null(schema) && validate) {
     array_copy <- .Call(nanoarrow_c_array_validate_after_modify, array_copy, 
schema)
   }
diff --git a/r/src/array.c b/r/src/array.c
index c4603b11..fb4eef21 100644
--- a/r/src/array.c
+++ b/r/src/array.c
@@ -93,8 +93,8 @@ SEXP nanoarrow_c_array_set_buffers(SEXP array_xptr, SEXP 
buffers_sexp) {
   struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr);
 
   int64_t n_buffers = Rf_xlength(buffers_sexp);
-  if (n_buffers > 3) {
-    Rf_error("length(array$buffers) must be <= 3");
+  if (n_buffers != array->n_buffers) {
+    Rf_error("Changing the number of buffers in array_modify is not 
supported");
   }
 
   // Release any buffers that aren't about to be replaced
@@ -248,6 +248,17 @@ static int move_array_buffers(struct ArrowArray* src, 
struct ArrowArray* dst,
   return NANOARROW_OK;
 }
 
+SEXP nanoarrow_c_array_finish_building(SEXP array_xptr) {
+  struct ArrowArray* array = nanoarrow_array_from_xptr(array_xptr);
+  struct ArrowError error;
+  int result = ArrowArrayFinishBuilding(array, 
NANOARROW_VALIDATION_LEVEL_NONE, &error);
+  if (result != NANOARROW_OK) {
+    Rf_error("ArrowArrayFinishBuilding(): %s", error.message);
+  }
+
+  return R_NilValue;
+}
+
 SEXP nanoarrow_c_array_validate_after_modify(SEXP array_xptr, SEXP 
schema_xptr) {
   // A very particular type of validation we can do with the ArrowArray we use
   // in nanoarrow_array_modify() (which was created using ArrowArrayInit).
@@ -271,6 +282,16 @@ SEXP nanoarrow_c_array_validate_after_modify(SEXP 
array_xptr, SEXP schema_xptr)
     Rf_error("ArrowArrayInitFromSchema(): %s", error.message);
   }
 
+  // Add any variadic buffers that might be required
+  if ((array->n_buffers > array_dst->n_buffers) &&
+      array->n_buffers > NANOARROW_MAX_FIXED_BUFFERS) {
+    result =
+        ArrowArrayAddVariadicBuffers(array_dst, array->n_buffers - 
array_dst->n_buffers);
+    if (result != NANOARROW_OK) {
+      Rf_error("ArrowArrayAddVariadicBuffers() failed");
+    }
+  }
+
   result = move_array_buffers(array, array_dst, schema, &error);
   if (result != NANOARROW_OK) {
     Rf_error("move_array_buffers: %s", error.message);
diff --git a/r/src/array.h b/r/src/array.h
index ebe965eb..96c2849a 100644
--- a/r/src/array.h
+++ b/r/src/array.h
@@ -94,6 +94,15 @@ static inline void array_export(SEXP array_xptr, struct 
ArrowArray* array_copy)
   array_copy->offset = array->offset;
 
   // Get buffer references, each of which preserve a reference to 
independent_array_xptr
+
+  // We might need some more buffers to be allocated for string views
+  if (array->n_buffers > 3) {
+    result = ArrowArrayAddVariadicBuffers(array_copy, array->n_buffers - 3);
+    if (result != NANOARROW_OK) {
+      Rf_error("ArrowArrayAddVariadicBuffers() failed");
+    }
+  }
+
   array_copy->n_buffers = array->n_buffers;
   for (int64_t i = 0; i < array->n_buffers; i++) {
     SEXP borrowed_buffer =
diff --git a/r/src/init.c b/r/src/init.c
index e3288dd4..4d2109a2 100644
--- a/r/src/init.c
+++ b/r/src/init.c
@@ -40,6 +40,7 @@ extern SEXP nanoarrow_c_array_set_offset(SEXP array_xptr, 
SEXP offset_sexp);
 extern SEXP nanoarrow_c_array_set_buffers(SEXP array_xptr, SEXP buffers_sexp);
 extern SEXP nanoarrow_c_array_set_children(SEXP array_xptr, SEXP 
children_sexp);
 extern SEXP nanoarrow_c_array_set_dictionary(SEXP array_xptr, SEXP 
dictionary_xptr);
+extern SEXP nanoarrow_c_array_finish_building(SEXP array_xptr);
 extern SEXP nanoarrow_c_array_validate_after_modify(SEXP array_xptr, SEXP 
schema_xptr);
 extern SEXP nanoarrow_c_array_set_schema(SEXP array_xptr, SEXP schema_xptr,
                                          SEXP validate_sexp);
@@ -121,6 +122,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"nanoarrow_c_array_set_buffers", (DL_FUNC)&nanoarrow_c_array_set_buffers, 
2},
     {"nanoarrow_c_array_set_children", 
(DL_FUNC)&nanoarrow_c_array_set_children, 2},
     {"nanoarrow_c_array_set_dictionary", 
(DL_FUNC)&nanoarrow_c_array_set_dictionary, 2},
+    {"nanoarrow_c_array_finish_building", 
(DL_FUNC)&nanoarrow_c_array_finish_building, 1},
     {"nanoarrow_c_array_validate_after_modify",
      (DL_FUNC)&nanoarrow_c_array_validate_after_modify, 2},
     {"nanoarrow_c_array_set_schema", (DL_FUNC)&nanoarrow_c_array_set_schema, 
3},
diff --git a/r/tests/testthat/test-array.R b/r/tests/testthat/test-array.R
index ae737aed..e2169d19 100644
--- a/r/tests/testthat/test-array.R
+++ b/r/tests/testthat/test-array.R
@@ -342,16 +342,52 @@ test_that("array modify can modify buffers", {
   array <- as_nanoarrow_array(1:5)
   expect_error(
     nanoarrow_array_modify(array, list(buffers = rep(list(NULL), 4))),
-    "must be <= 3"
+    "Changing the number of buffers in array_modify is not supported"
   )
 
   # Check that specifying too few buffers will result in a validation error
   expect_error(
     nanoarrow_array_modify(array, list(buffers = list()), validate = TRUE),
-    "Expected 2 buffer"
+    "Changing the number of buffers in array_modify is not supported"
   )
 })
 
+test_that("array modify can modify variadic buffers", {
+  # Create a string view array with >1 variadic buffers. The default
+  # internal threshold for splitting up internal buffers is ~30kb.
+  boring_strings <- vapply(
+    1:10000,
+    function(i) paste0("boring string", i),
+    character(1)
+  )
+
+  array <- as_nanoarrow_array(boring_strings, schema = na_string_view())
+  expect_identical(length(array$buffers), 9L)
+  variadic_sizes <- convert_buffer(array$buffers[[9]])
+  expect_identical(variadic_sizes, c(32757, 32759, 32759, 32759, 32759, 5101))
+  expect_identical(convert_array(array), boring_strings)
+
+  # Save the original array
+  original_array <- array
+
+  # Modify one of the array buffers
+  cool_string_bytes <- charToRaw("cooool string1")
+  first_buffer <- as.raw(array$buffers[[3]])
+  first_buffer[seq_along(cool_string_bytes)] <- cool_string_bytes
+
+  array$buffers[[3]] <- first_buffer
+
+  cool_strings <- boring_strings
+  cool_strings[1] <- "cooool string1"
+  expect_identical(convert_array(array), cool_strings)
+
+  variadic_sizes <- convert_buffer(array$buffers[[9]])
+  expect_identical(variadic_sizes, c(32757, 32759, 32759, 32759, 32759, 5101))
+
+  # Check that the original was unmodified
+  expect_identical(convert_array(original_array), boring_strings)
+})
+
 test_that("array modify checks buffer sizes", {
   array <- as_nanoarrow_array(1:5)
   expect_error(
diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c
index 0ea48c7a..2f0f824c 100644
--- a/src/nanoarrow/common/array.c
+++ b/src/nanoarrow/common/array.c
@@ -38,7 +38,6 @@ static void ArrowArrayReleaseInternal(struct ArrowArray* 
array) {
       ArrowBufferReset(&private_data->variadic_buffers[i]);
     }
     ArrowFree(private_data->variadic_buffers);
-    ArrowFree(private_data->variadic_buffer_sizes);
     ArrowFree(private_data);
   }
 
@@ -74,6 +73,10 @@ static void ArrowArrayReleaseInternal(struct ArrowArray* 
array) {
   array->release = NULL;
 }
 
+static int ArrowArrayIsInternal(struct ArrowArray* array) {
+  return array->release == &ArrowArrayReleaseInternal;
+}
+
 static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
                                                enum ArrowType storage_type) {
   switch (storage_type) {
@@ -170,7 +173,6 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* 
array,
   }
   private_data->n_variadic_buffers = 0;
   private_data->variadic_buffers = NULL;
-  private_data->variadic_buffer_sizes = NULL;
   private_data->list_view_offset = 0;
 
   array->private_data = private_data;
@@ -333,25 +335,35 @@ ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* 
array, int64_t i,
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
-  switch (i) {
-    case 0:
-      ArrowBufferMove(buffer, &private_data->bitmap.buffer);
-      private_data->buffer_data[i] = private_data->bitmap.buffer.data;
-      break;
-    case 1:
-    case 2:
-      ArrowBufferMove(buffer, &private_data->buffers[i - 1]);
-      private_data->buffer_data[i] = private_data->buffers[i - 1].data;
-      break;
-    default:
-      return EINVAL;
+  if (i >= array->n_buffers || i < 0) {
+    return EINVAL;
   }
 
+  // Find the `i`th buffer, release what is currently there, and move the
+  // supplied buffer into that slot.
+  struct ArrowBuffer* dst = ArrowArrayBuffer(array, i);
+  ArrowBufferReset(dst);
+  ArrowBufferMove(buffer, dst);
+
+  // Flush the pointer into array->buffers. In theory clients should call
+  // ArrowArrayFinishBuilding() to flush the pointer values before passing
+  // this array elsewhere; however, in early nanoarrow versions this was not
+  // needed and some code may depend on this being true.
+  private_data->buffer_data[i] = dst->data;
+  array->buffers = private_data->buffer_data;
+
   return NANOARROW_OK;
 }
 
 static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* 
array_view,
-                                                  struct ArrowArray* array) {
+                                                  struct ArrowArray* array,
+                                                  struct ArrowError* error) {
+  if (!ArrowArrayIsInternal(array)) {
+    ArrowErrorSet(error,
+                  "Can't initialize internal ArrowArrayView from external 
ArrowArray");
+    return EINVAL;
+  }
+
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
@@ -376,7 +388,8 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct 
ArrowArrayView* array_v
   }
 
   for (int64_t i = 0; i < array->n_children; i++) {
-    result = ArrowArrayViewInitFromArray(array_view->children[i], 
array->children[i]);
+    result =
+        ArrowArrayViewInitFromArray(array_view->children[i], 
array->children[i], error);
     if (result != NANOARROW_OK) {
       ArrowArrayViewReset(array_view);
       return result;
@@ -390,7 +403,8 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct 
ArrowArrayView* array_v
       return result;
     }
 
-    result = ArrowArrayViewInitFromArray(array_view->dictionary, 
array->dictionary);
+    result =
+        ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary, 
error);
     if (result != NANOARROW_OK) {
       ArrowArrayViewReset(array_view);
       return result;
@@ -403,7 +417,7 @@ static ArrowErrorCode ArrowArrayViewInitFromArray(struct 
ArrowArrayView* array_v
 static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array,
                                                 struct ArrowArrayView* 
array_view) {
   // Loop through buffers and reserve the extra space that we know about
-  for (int64_t i = 0; i < array->n_buffers; i++) {
+  for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
     // Don't reserve on a validity buffer that hasn't been allocated yet
     if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY &&
         ArrowArrayBuffer(array, i)->data == NULL) {
@@ -431,7 +445,7 @@ static ArrowErrorCode ArrowArrayReserveInternal(struct 
ArrowArray* array,
 ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array,
                                  int64_t additional_size_elements) {
   struct ArrowArrayView array_view;
-  NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array, 
NULL));
 
   // Calculate theoretical buffer sizes (recursively)
   ArrowArrayViewSetLength(&array_view, array->length + 
additional_size_elements);
@@ -463,47 +477,62 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct 
ArrowArray* array) {
   }
 
   for (int64_t i = 0; i < array->n_children; i++) {
-    NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i]));
+    if (ArrowArrayIsInternal(array->children[i])) {
+      NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[i]));
+    }
   }
 
-  if (array->dictionary != NULL) {
+  if (array->dictionary != NULL && ArrowArrayIsInternal(array->dictionary)) {
     NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary));
   }
 
   return NANOARROW_OK;
 }
 
-static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
+static ArrowErrorCode ArrowArrayFlushInternalPointers(struct ArrowArray* 
array) {
+  NANOARROW_DCHECK(ArrowArrayIsInternal(array));
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
 
-  const bool is_binary_view = private_data->storage_type == 
NANOARROW_TYPE_STRING_VIEW ||
-                              private_data->storage_type == 
NANOARROW_TYPE_BINARY_VIEW;
-  const int32_t nfixed_buf = is_binary_view ? 2 : NANOARROW_MAX_FIXED_BUFFERS;
+  if (array->n_buffers > NANOARROW_MAX_FIXED_BUFFERS) {
+    // If the variadic sizes buffer was not set and there is at least one 
variadic
+    // buffer, populate it now (if there are no variadic buffers there will be 
exactly
+    // three total buffers and we don't need to do anything special here). 
Notably, this
+    // will occur when building a BinaryView/StringView array by element using 
the
+    // appender.
+    struct ArrowBuffer* sizes_buffer = ArrowArrayBuffer(array, 
array->n_buffers - 1);
+    if (sizes_buffer->data == NULL && sizes_buffer->size_bytes == 0) {
+      NANOARROW_RETURN_NOT_OK(
+          ArrowBufferReserve(sizes_buffer, private_data->n_variadic_buffers));
+      for (int64_t i = 0; i < private_data->n_variadic_buffers; i++) {
+        struct ArrowBuffer* variadic_buffer =
+            ArrowArrayBuffer(array, i + NANOARROW_BINARY_VIEW_FIXED_BUFFERS);
+        NANOARROW_RETURN_NOT_OK(
+            ArrowBufferAppendInt64(sizes_buffer, variadic_buffer->size_bytes));
+      }
+    }
+  }
 
-  for (int32_t i = 0; i < nfixed_buf; i++) {
+  for (int32_t i = 0; i < array->n_buffers; i++) {
     private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data;
   }
 
-  if (is_binary_view) {
-    const int32_t nvirt_buf = private_data->n_variadic_buffers;
-    private_data->buffer_data = (const void**)ArrowRealloc(
-        private_data->buffer_data, sizeof(void*) * (nfixed_buf + nvirt_buf + 
1));
-    for (int32_t i = 0; i < nvirt_buf; i++) {
-      private_data->buffer_data[nfixed_buf + i] = 
private_data->variadic_buffers[i].data;
-    }
-    private_data->buffer_data[nfixed_buf + nvirt_buf] =
-        private_data->variadic_buffer_sizes;
-    array->buffers = (const void**)(private_data->buffer_data);
-  }
+  array->buffers = (const void**)(private_data->buffer_data);
 
+  // Flush internal pointers for child/dictionary arrays if we allocated them. 
Clients
+  // building arrays by buffer might have moved arrays from some other source 
(e.g.,
+  // to create a record batch) and calling this function in that case will 
cause a crash.
   for (int64_t i = 0; i < array->n_children; i++) {
-    ArrowArrayFlushInternalPointers(array->children[i]);
+    if (ArrowArrayIsInternal(array->children[i])) {
+      
NANOARROW_RETURN_NOT_OK(ArrowArrayFlushInternalPointers(array->children[i]));
+    }
   }
 
-  if (array->dictionary != NULL) {
-    ArrowArrayFlushInternalPointers(array->dictionary);
+  if (array->dictionary != NULL && ArrowArrayIsInternal(array->dictionary)) {
+    
NANOARROW_RETURN_NOT_OK(ArrowArrayFlushInternalPointers(array->dictionary));
   }
+
+  return NANOARROW_OK;
 }
 
 ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array,
@@ -519,7 +548,7 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* 
array,
 
   // Make sure the value we get with array->buffers[i] is set to the actual
   // pointer (which may have changed from the original due to reallocation)
-  ArrowArrayFlushInternalPointers(array);
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFlushInternalPointers(array), 
error);
 
   if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) {
     return NANOARROW_OK;
@@ -527,8 +556,8 @@ ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* 
array,
 
   // For validation, initialize an ArrowArrayView with our known buffer sizes
   struct ArrowArrayView array_view;
-  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, 
array),
-                                     error);
+  NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+      ArrowArrayViewInitFromArray(&array_view, array, error), error);
   int result = ArrowArrayViewValidate(&array_view, validation_level, error);
   ArrowArrayViewReset(&array_view);
   return result;
diff --git a/src/nanoarrow/common/array_test.cc 
b/src/nanoarrow/common/array_test.cc
index a523a360..35a93787 100644
--- a/src/nanoarrow/common/array_test.cc
+++ b/src/nanoarrow/common/array_test.cc
@@ -216,6 +216,58 @@ TEST(ArrayTest, ArrayTestSetBuffer) {
   ArrowArrayRelease(&array);
 }
 
+TEST(ArrayTest, ArrayTestStringViewBuffer) {
+  // Build an array of size zero but with a few variadic buffers to ensure our 
get/set
+  // buffer logic finds the correct buffers.
+
+  struct ArrowBuffer vbuffer0, vbuffer1, vbuffer2;
+  ArrowBufferInit(&vbuffer0);
+  ASSERT_EQ(ArrowBufferAppendStringView(&vbuffer0, ArrowCharView("banana 
one")),
+            NANOARROW_OK);
+  ArrowBufferInit(&vbuffer1);
+  ASSERT_EQ(ArrowBufferAppendStringView(&vbuffer1, ArrowCharView("banana 
two")),
+            NANOARROW_OK);
+  ArrowBufferInit(&vbuffer2);
+  ASSERT_EQ(ArrowBufferAppendStringView(&vbuffer2, ArrowCharView("banana 
three")),
+            NANOARROW_OK);
+
+  struct ArrowArray array;
+  ASSERT_EQ(ArrowArrayInitFromType(&array, NANOARROW_TYPE_STRING_VIEW), 
NANOARROW_OK);
+  ASSERT_EQ(array.n_buffers, 3);
+
+  // Add variadic buffers
+  ASSERT_EQ(ArrowArrayAddVariadicBuffers(&array, 3), NANOARROW_OK);
+  ASSERT_EQ(array.n_buffers, 6);
+
+  // Set their content
+  ASSERT_EQ(ArrowArraySetBuffer(&array, 2, &vbuffer0), NANOARROW_OK);
+  ASSERT_EQ(ArrowArraySetBuffer(&array, 3, &vbuffer1), NANOARROW_OK);
+  ASSERT_EQ(ArrowArraySetBuffer(&array, 4, &vbuffer2), NANOARROW_OK);
+
+  // Ensure it is reflected in array->buffers
+  EXPECT_EQ(memcmp(array.buffers[2], "banana one", 10), 0);
+  EXPECT_EQ(memcmp(array.buffers[3], "banana two", 10), 0);
+  EXPECT_EQ(memcmp(array.buffers[4], "banana three", 12), 0);
+
+  // Sizes buffer has not been built yet
+  EXPECT_EQ(array.buffers[5], nullptr);
+
+  // ArrowArrayBuffer should find the correct buffers as well
+  EXPECT_EQ(ArrowArrayBuffer(&array, 2)->data, array.buffers[2]);
+  EXPECT_EQ(ArrowArrayBuffer(&array, 3)->data, array.buffers[3]);
+  EXPECT_EQ(ArrowArrayBuffer(&array, 4)->data, array.buffers[4]);
+  EXPECT_EQ(ArrowArrayBuffer(&array, 5)->data, array.buffers[5]);
+
+  // After we finish building, the sizes buffer should have been built for us
+  ASSERT_EQ(ArrowArrayFinishBuilding(&array, NANOARROW_VALIDATION_LEVEL_NONE, 
nullptr),
+            NANOARROW_OK);
+
+  int64_t expected_sizes[] = {10, 10, 12};
+  EXPECT_EQ(memcmp(array.buffers[5], &expected_sizes, sizeof(expected_sizes)), 
0);
+
+  ArrowArrayRelease(&array);
+}
+
 TEST(ArrayTest, ArrayTestBuildByBuffer) {
   // the array ["a", null, "bc", null, "def", null, "ghij"]
   uint8_t validity_bitmap[] = {0x55};
diff --git a/src/nanoarrow/common/inline_array.h 
b/src/nanoarrow/common/inline_array.h
index 90fa9e6a..885276b4 100644
--- a/src/nanoarrow/common/inline_array.h
+++ b/src/nanoarrow/common/inline_array.h
@@ -43,8 +43,25 @@ static inline struct ArrowBuffer* ArrowArrayBuffer(struct 
ArrowArray* array, int
   switch (i) {
     case 0:
       return &private_data->bitmap.buffer;
+    case 1:
+      return private_data->buffers;
     default:
-      return private_data->buffers + i - 1;
+      if (array->n_buffers > 3 && i == (array->n_buffers - 1)) {
+        // The variadic buffer sizes buffer if for a BinaryView/String view 
array
+        // is always stored in private_data->buffers[1]; however, from the 
numbered
+        // buffers perspective this is the array->buffers[array->n_buffers - 
1].
+        return private_data->buffers + 1;
+      } else if (array->n_buffers > 3) {
+        // If there are one or more variadic buffers, they are stored in
+        // private_data->variadic_buffers
+        return private_data->variadic_buffers + (i - 2);
+      } else {
+        // Otherwise, we're just accessing buffer at index 2 (e.g., 
String/Binary
+        // data buffer or variadic sizes buffer for the case where there are no
+        // variadic buffers)
+        NANOARROW_DCHECK(i == 2);
+        return private_data->buffers + i - 1;
+      }
   }
 }
 
@@ -527,9 +544,9 @@ static inline int32_t ArrowArrayVariadicBufferCount(struct 
ArrowArray* array) {
 }
 
 static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray* 
array,
-                                                          int32_t nbuffers) {
+                                                          int32_t n_buffers) {
   const int32_t n_current_bufs = ArrowArrayVariadicBufferCount(array);
-  const int32_t nvariadic_bufs_needed = n_current_bufs + nbuffers;
+  const int32_t nvariadic_bufs_needed = n_current_bufs + n_buffers;
 
   struct ArrowArrayPrivateData* private_data =
       (struct ArrowArrayPrivateData*)array->private_data;
@@ -539,19 +556,24 @@ static inline ArrowErrorCode 
ArrowArrayAddVariadicBuffers(struct ArrowArray* arr
   if (private_data->variadic_buffers == NULL) {
     return ENOMEM;
   }
-  private_data->variadic_buffer_sizes = (int64_t*)ArrowRealloc(
-      private_data->variadic_buffer_sizes, sizeof(int64_t) * 
nvariadic_bufs_needed);
-  if (private_data->variadic_buffer_sizes == NULL) {
-    return ENOMEM;
-  }
+
+  private_data->n_variadic_buffers = nvariadic_bufs_needed;
+  array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1 + 
nvariadic_bufs_needed;
+
+  private_data->buffer_data = (const void**)ArrowRealloc(
+      private_data->buffer_data, array->n_buffers * sizeof(void*));
 
   for (int32_t i = n_current_bufs; i < nvariadic_bufs_needed; i++) {
     ArrowBufferInit(&private_data->variadic_buffers[i]);
-    private_data->variadic_buffer_sizes[i] = 0;
+    private_data->buffer_data[NANOARROW_BINARY_VIEW_FIXED_BUFFERS + i] = NULL;
   }
-  private_data->n_variadic_buffers = nvariadic_bufs_needed;
-  array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1 + 
nvariadic_bufs_needed;
 
+  // Zero out memory for the final buffer (variadic sizes buffer we haven't 
built yet)
+  private_data->buffer_data[NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 
nvariadic_bufs_needed] =
+      NULL;
+
+  // Ensure array->buffers points to a valid value
+  array->buffers = private_data->buffer_data;
   return NANOARROW_OK;
 }
 
@@ -589,7 +611,6 @@ static inline ArrowErrorCode ArrowArrayAppendBytes(struct 
ArrowArray* array,
       bvt.ref.offset = (int32_t)variadic_buf->size_bytes;
       NANOARROW_RETURN_NOT_OK(
           ArrowBufferAppend(variadic_buf, value.data.as_char, 
value.size_bytes));
-      private_data->variadic_buffer_sizes[buf_index] = 
variadic_buf->size_bytes;
     }
     NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &bvt, sizeof(bvt)));
   } else {
diff --git a/src/nanoarrow/common/inline_types.h 
b/src/nanoarrow/common/inline_types.h
index a8d194f1..8748c67f 100644
--- a/src/nanoarrow/common/inline_types.h
+++ b/src/nanoarrow/common/inline_types.h
@@ -870,9 +870,6 @@ struct ArrowArrayPrivateData {
   // Variadic buffers for binary view types
   struct ArrowBuffer* variadic_buffers;
 
-  // Size of each variadic buffer in bytes
-  int64_t* variadic_buffer_sizes;
-
   // The current offset used to build list views
   int64_t list_view_offset;
 };
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index a32879a7..aa82d987 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -905,6 +905,12 @@ NANOARROW_DLL void ArrowArraySetValidityBitmap(struct 
ArrowArray* array,
 NANOARROW_DLL ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, 
int64_t i,
                                                  struct ArrowBuffer* buffer);
 
+/// \brief Add variadic buffers to a string or binary view array
+///
+/// array must have been allocated using ArrowArrayInitFromType()
+static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray* 
array,
+                                                          int32_t n_buffers);
+
 /// \brief Get the validity bitmap of an ArrowArray
 ///
 /// array must have been allocated using ArrowArrayInitFromType()

Reply via email to