This is an automated email from the ASF dual-hosted git repository. paleolimbot pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push: new 8e8e38d3 chore(python): Restructure buffer packing to support nulls and improve performance (#426) 8e8e38d3 is described below commit 8e8e38d3890a16348c82d1c83a27d8af56085b25 Author: Dewey Dunnington <de...@dunnington.ca> AuthorDate: Mon Apr 15 11:34:13 2024 -0300 chore(python): Restructure buffer packing to support nulls and improve performance (#426) First, this PR fixes the rather uninformative error that occurs on any error while building an Array (closes #423). The error is now: ```python import nanoarrow as na na.Array([1, 2, 3]) #> ValueError #> ... #> An error occurred whilst converting object of type list to nanoarrow.c_array_stream or nanoarrow.c_array: #> schema is required for CArray import from iterable ``` Second, this PR adds support for `None` in iterables. This makes it much more convenient to create arrays with nulls (closes #424). ```python import nanoarrow as na na.Array([1, 2, None, 4], na.int32()) #> nanoarrow.Array<int32>[4] #> 1 #> 2 #> None #> 4 ``` Finally, this PR tweaks the implementation of packing an iterable into a buffer to avoid the very bad performance that existed previously. The optimizations added were: - The `CBufferBuilder` now implements the buffer protocol (so that we can use `pack_into`) - The `__len__` attribute is checked to preallocate where possible Those optimizations resulted in a ~2x improvement over the previous code; however, the types that can use the `array` constructor have the biggest wins (5-6x improvement). An example with the biggest gain: ```python import numpy as np import nanoarrow as na import pyarrow as pa floats = np.random.random(int(1e6)) floats_lst = list(floats) %timeit pa.array(floats, pa.float64()) #> 1.79 µs ± 9.27 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each) %timeit pa.array(floats_lst, pa.float64()) #> 13.8 ms ± 35.5 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each) %timeit pa.array(iter(floats_lst), pa.float64()) #> 17.9 ms ± 37.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) %timeit na.c_array(floats, na.float64()) #> 5.51 µs ± 25.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each) %timeit na.c_array(floats_lst, na.float64(nullable=False)) #> 16.5 ms ± 41.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) %timeit na.c_array(iter(floats_lst), na.float64(nullable=False)) #> 29.1 ms ± 254 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) %timeit na.c_array(floats_lst, na.float64()) #> 43.6 ms ± 484 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) %timeit na.c_array(iter(floats_lst), na.float64()) #> 43 ms ± 227 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` Before this PR: ```python %timeit na.c_array(floats, na.float64()) #> 5.66 µs ± 44.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each) %timeit na.c_array(floats_lst, na.float64()) #> 104 ms ± 187 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) %timeit na.c_array(iter(floats_lst), na.float64()) #> 107 ms ± 202 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` It should be noted that there is probably one more PR on top of this to support building variable-length string/binary arrays (and possibly move some of the building code out of `c_lib.py` since it is getting a little crowded there). 
--- python/src/nanoarrow/_lib.pyx | 231 ++++++++++++++++++++++++++++++++++++++++-- python/src/nanoarrow/c_lib.py | 61 +++++++++-- python/tests/test_c_array.py | 73 ++++++++++++- python/tests/test_c_buffer.py | 21 +++- python/tests/test_iterator.py | 9 +- 5 files changed, 363 insertions(+), 32 deletions(-) diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx index d9bda9d9..463f07ce 100644 --- a/python/src/nanoarrow/_lib.pyx +++ b/python/src/nanoarrow/_lib.pyx @@ -42,6 +42,7 @@ from cpython cimport ( PyObject_GetBuffer, PyBuffer_Release, PyBuffer_ToContiguous, + PyBuffer_FillInfo, PyBUF_ANY_CONTIGUOUS, PyBUF_FORMAT, PyBUF_WRITABLE @@ -1856,34 +1857,103 @@ cdef class CBuffer: cdef class CBufferBuilder: - """Wrapper around writable owned buffer CPU content""" + """Wrapper around writable CPU buffer content + + This class provides a growable type-aware buffer that can be used + to create a typed buffer from a Python iterable. This method of + creating a buffer is usually slower than constructors like + ``array.array()`` or ``numpy.array()``; however, this class supports + all Arrow types with a single data buffer (e.g., boolean bitmaps, + float16, intervals, fixed-size binary), some of which are not supported + by other constructors. + """ cdef CBuffer _buffer + cdef bint _locked def __cinit__(self): self._buffer = CBuffer.empty() + self._locked = False + + cdef _assert_unlocked(self): + if self._locked: + raise BufferError("CBufferBuilder is locked") + + # Implement the buffer protocol so that this object can be used as + # the argument to Struct.readinto() (or perhaps written to by + # an independent library). 
+ def __getbuffer__(self, Py_buffer* buffer, int flags): + self._assert_unlocked() + PyBuffer_FillInfo( + buffer, + self, + self._buffer._ptr.data, + self._buffer._ptr.capacity_bytes, + 0, + flags + ) + self._locked = True + + def __releasebuffer__(self, Py_buffer* buffer): + self._locked = False + + def set_data_type(self, ArrowType type_id, int element_size_bits=0): + """Set the data type used to interpret elements in :meth:`write_elements`.""" + self._buffer._set_data_type(type_id, element_size_bits) + return self + + @property + def format(self): + """The ``struct`` format code of the underlying buffer""" + return self._buffer._format.decode() @property def size_bytes(self): + """The number of bytes that have been written to this buffer""" return self._buffer.size_bytes @property def capacity_bytes(self): + """The number of bytes allocated in the underlying buffer""" return self._buffer._ptr.capacity_bytes - def set_data_type(self, ArrowType type_id, int element_size_bits=0): - self._buffer._set_data_type(type_id, element_size_bits) - return self - def reserve_bytes(self, int64_t additional_bytes): + """Ensure that the underlying buffer has space for ``additional_bytes`` + more bytes to be written""" + self._assert_unlocked() cdef int code = ArrowBufferReserve(self._buffer._ptr, additional_bytes) Error.raise_error_not_ok("ArrowBufferReserve()", code) return self + def advance(self, int64_t additional_bytes): + """Manually increase :attr:`size_bytes` by ``additional_bytes`` + + This can be used after writing to the buffer using the buffer protocol + to ensure that :attr:`size_bytes` accurately reflects the number of + bytes written to the buffer. 
+ """ + cdef int64_t new_size = self._buffer._ptr.size_bytes + additional_bytes + if new_size < 0 or new_size > self._buffer._ptr.capacity_bytes: + raise IndexError(f"Can't advance {additional_bytes} from {self.size_bytes}") + + self._buffer._ptr.size_bytes = new_size + return self + def write(self, content): + """Write bytes to this buffer + + Writes the bytes of ``content`` without considering the element type of + ``content`` or the element type of this buffer. + + This method returns the number of bytes that were written. + """ + self._assert_unlocked() + cdef Py_buffer buffer cdef int64_t out PyObject_GetBuffer(content, &buffer, PyBUF_ANY_CONTIGUOUS) + # TODO: Check for single dimension? + cdef int code = ArrowBufferReserve(self._buffer._ptr, buffer.len) if code != NANOARROW_OK: PyBuffer_Release(&buffer) @@ -1904,23 +1974,58 @@ cdef class CBufferBuilder: return out def write_elements(self, obj): + """Write an iterable of elements to this buffer + + Writes the elements of iterable ``obj`` according to the binary + representation specified by :attr:`format`. This is currently + powered by ``struct.pack_into()`` except when building bitmaps + where an internal implementation is used. + + This method returns the number of elements that were written. 
+ """ + self._assert_unlocked() + + # Boolean arrays need their own writer since Python provides + # no helpers for bitpacking if self._buffer._data_type == NANOARROW_TYPE_BOOL: return self._write_bits(obj) cdef int64_t n_values = 0 + cdef int64_t bytes_per_element = calcsize(self._buffer.format) + cdef int code + struct_obj = Struct(self._buffer._format) - pack = struct_obj.pack - write = self.write + pack_into = struct_obj.pack_into + # If an object has a length, we can avoid extra allocations + if hasattr(obj, "__len__"): + code = ArrowBufferReserve(self._buffer._ptr, bytes_per_element * len(obj)) + Error.raise_error_not_ok("ArrowBufferReserve()", code) + + # Types whose Python representation is a tuple need a slightly different + # pack_into() call. if code != NANOARROW_OK is used instead of + # Error.raise_error_not_ok() Cython can avoid the extra function call + # and this is a very tight loop. if self._buffer._data_type in (NANOARROW_TYPE_INTERVAL_DAY_TIME, - NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO): + NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO): for item in obj: + code = ArrowBufferReserve(self._buffer._ptr, bytes_per_element) + if code != NANOARROW_OK: + Error.raise_error("ArrowBufferReserve()", code) + + pack_into(self, self._buffer._ptr.size_bytes, *item) + self._buffer._ptr.size_bytes += bytes_per_element n_values += 1 - write(pack(*item)) + else: for item in obj: + code = ArrowBufferReserve(self._buffer._ptr, bytes_per_element) + if code != NANOARROW_OK: + Error.raise_error("ArrowBufferReserve()", code) + + pack_into(self, self._buffer._ptr.size_bytes, item) + self._buffer._ptr.size_bytes += bytes_per_element n_values += 1 - write(pack(item)) return n_values @@ -1951,6 +2056,15 @@ cdef class CBufferBuilder: return n_values def finish(self): + """Finish building this buffer + + Performs any steps required to finish building this buffer and + returns the result. 
Any behaviour resulting from calling methods + on this object after it has been finished is not currently + defined (but should not crash). + """ + self._assert_unlocked() + cdef CBuffer out = self._buffer out._populate_view() @@ -1962,6 +2076,103 @@ cdef class CBufferBuilder: return f"{class_label}({self.size_bytes}/{self.capacity_bytes})" +cdef class NoneAwareWrapperIterator: + """Nullable iterator wrapper + + This class wraps an iterable ``obj`` that might contain ``None`` values + such that the iterable provided by this class contains "empty" (but valid) + values. After ``obj`` has been completely consumed, one can call + ``finish()`` to obtain the resulting bitmap. This is useful for passing + iterables that might contain None to tools that cannot handle them + (e.g., struct.pack(), array.array()). + """ + cdef ArrowBitmap _bitmap + cdef object _obj + cdef object _value_if_none + cdef int64_t _valid_count + cdef int64_t _item_count + + def __cinit__(self, obj, type_id, item_size_bytes=0): + ArrowBitmapInit(&self._bitmap) + self._obj = iter(obj) + + self._value_if_none = self._get_value_if_none(type_id, item_size_bytes) + self._valid_count = 0 + self._item_count = 0 + + def __dealloc__(self): + ArrowBitmapReset(&self._bitmap) + + def reserve(self, int64_t additional_elements): + cdef int code = ArrowBitmapReserve(&self._bitmap, additional_elements) + Error.raise_error_not_ok(self, code) + + def _get_value_if_none(self, type_id, item_size_bytes=0): + if type_id == NANOARROW_TYPE_INTERVAL_DAY_TIME: + return (0, 0) + elif type_id == NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: + return (0, 0, 0) + elif type_id == NANOARROW_TYPE_BOOL: + return False + elif type_id in (NANOARROW_TYPE_BINARY, NANOARROW_TYPE_FIXED_SIZE_BINARY): + return b"\x00" * item_size_bytes + elif type_id in (NANOARROW_TYPE_HALF_FLOAT, NANOARROW_TYPE_FLOAT, NANOARROW_TYPE_DOUBLE): + return 0.0 + else: + return 0 + + cdef _append_to_validity(self, int is_valid): + self._valid_count += is_valid + 
self._item_count += 1 + + # Avoid allocating a bitmap if all values seen so far are valid + if self._valid_count == self._item_count: + return + + # If the bitmap hasn't been allocated yet, allocate it now and + # fill with 1s for all previous elements. + cdef int code + if self._bitmap.size_bits == 0 and self._item_count > 1: + code = ArrowBitmapAppend(&self._bitmap, 1, self._item_count - 1) + if code != NANOARROW_OK: + Error.raise_error("ArrowBitmapAppend()", code) + + # Append this element to the bitmap + code = ArrowBitmapAppend(&self._bitmap, is_valid, 1) + if code != NANOARROW_OK: + Error.raise_error("ArrowBitmapAppend()", code) + + def __iter__(self): + for item in self._obj: + if item is None: + self._append_to_validity(0) + yield self._value_if_none + else: + self._append_to_validity(1) + yield item + + def finish(self): + """Obtain the total count, null count, and validity bitmap after + consuming this iterable.""" + cdef CBuffer validity + null_count = self._item_count - self._valid_count + + # If we did allocate a bitmap, make sure the last few bits are zeroed + if null_count > 0 and self._bitmap.size_bits % 8 != 0: + ArrowBitmapAppendUnsafe(&self._bitmap, 0, self._bitmap.size_bits % 8) + + if null_count > 0: + validity = CBuffer.empty() + ArrowBufferMove(&self._bitmap.buffer, validity._ptr) + validity._set_data_type(NANOARROW_TYPE_BOOL) + + return ( + self._item_count, + null_count, + validity if null_count > 0 else None + ) + + cdef class CArrayBuilder: cdef CArray c_array cdef ArrowArray* _ptr diff --git a/python/src/nanoarrow/c_lib.py b/python/src/nanoarrow/c_lib.py index 0acc0a9d..3dd85d29 100644 --- a/python/src/nanoarrow/c_lib.py +++ b/python/src/nanoarrow/c_lib.py @@ -38,6 +38,7 @@ from nanoarrow._lib import ( CSchema, CSchemaBuilder, CSchemaView, + NoneAwareWrapperIterator, _obj_is_buffer, _obj_is_capsule, ) @@ -364,8 +365,8 @@ def c_array_stream(obj=None, schema=None) -> CArrayStream: return CArrayStream.from_array_list([array], array.schema, 
validate=False) except Exception as e: raise TypeError( - f"Can't convert object of type {type(obj).__name__} " - "to nanoarrow.c_array_stream or nanoarrow.c_array" + f"An error occurred whilst converting {type(obj).__name__} " + f"to nanoarrow.c_array_stream or nanoarrow.c_array: \n {e}" ) from e @@ -602,7 +603,7 @@ def _c_array_from_pybuffer(obj) -> CArray: return builder.finish() -def _c_array_from_iterable(obj, schema=None): +def _c_array_from_iterable(obj, schema=None) -> CArray: if schema is None: raise ValueError("schema is required for CArray import from iterable") @@ -618,28 +619,66 @@ def _c_array_from_iterable(obj, schema=None): builder.start_appending() return builder.finish() - # Use buffer create for crude support of array from iterable - buffer, n_values = _c_buffer_from_iterable(obj, schema) + # We need to know a few things about the data type to choose the appropriate + # strategy for building the array. + schema_view = c_schema_view(schema) + + if schema_view.storage_type_id != schema_view.type_id: + raise ValueError( + f"Can't create array from iterable for type {schema_view.type}" + ) + + # Creating a buffer from an iterable does not handle None values, + # but we can do so here with the NoneAwareWrapperIterator() wrapper. + # This approach is quite a bit slower, so only do it for a nullable + # type. 
+ if schema_view.nullable: + obj_wrapper = NoneAwareWrapperIterator( + obj, schema_view.storage_type_id, schema_view.fixed_size + ) + + if obj_len > 0: + obj_wrapper.reserve(obj_len) + + data, _ = _c_buffer_from_iterable(obj_wrapper, schema_view) + n_values, null_count, validity = obj_wrapper.finish() + else: + data, n_values = _c_buffer_from_iterable(obj, schema_view) + null_count = 0 + validity = None return c_array_from_buffers( - schema, n_values, buffers=(None, buffer), null_count=0, move=True + schema, n_values, buffers=(validity, data), null_count=null_count, move=True ) def _c_buffer_from_iterable(obj, schema=None) -> CBuffer: + import array + if schema is None: raise ValueError("CBuffer from iterable requires schema") - builder = CBufferBuilder() - schema_view = c_schema_view(schema) if schema_view.storage_type_id != schema_view.type_id: - raise ValueError(f"Can't create buffer from type {schema}") + raise ValueError( + f"Can't create buffer from iterable for type {schema_view.type}" + ) + + builder = CBufferBuilder() if schema_view.storage_type_id == CArrowType.FIXED_SIZE_BINARY: builder.set_data_type(CArrowType.BINARY, schema_view.fixed_size * 8) else: builder.set_data_type(schema_view.storage_type_id) - n_values_written = builder.write_elements(obj) - return builder.finish(), n_values_written + # If we are using a typecode supported by the array module, it has much + # faster implementations of safely building buffers from iterables + if ( + builder.format in array.typecodes + and schema_view.storage_type_id != CArrowType.BOOL + ): + buf = array.array(builder.format, obj) + return CBuffer.from_pybuffer(buf), len(buf) + + n_values = builder.write_elements(obj) + return builder.finish(), n_values diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py index 1536d159..1511196f 100644 --- a/python/tests/test_c_array.py +++ b/python/tests/test_c_array.py @@ -219,15 +219,86 @@ def test_c_array_from_iterable_empty(): assert 
len(array_view.buffer(2)) == 0 -def test_c_array_from_iterable_non_empty(): +def test_c_array_from_iterable_non_empty_nullable_without_nulls(): c_array = na.c_array([1, 2, 3], na.int32()) assert c_array.length == 3 assert c_array.null_count == 0 view = na.c_array_view(c_array) + assert list(view.buffer(0)) == [] assert list(view.buffer(1)) == [1, 2, 3] +def test_c_array_from_iterable_non_empty_non_nullable(): + c_array = na.c_array([1, 2, 3], na.int32(nullable=False)) + assert c_array.length == 3 + assert c_array.null_count == 0 + + view = na.c_array_view(c_array) + assert list(view.buffer(0)) == [] + assert list(view.buffer(1)) == [1, 2, 3] + + +def test_c_array_from_iterable_int_with_nulls(): + c_array = na.c_array([1, None, 3], na.int32()) + assert c_array.length == 3 + assert c_array.null_count == 1 + + view = na.c_array_view(c_array) + assert list(view.buffer(0).elements()) == [True, False, True] + [False] * 5 + assert list(view.buffer(1)) == [1, 0, 3] + + +def test_c_array_from_iterable_float_with_nulls(): + c_array = na.c_array([1, None, 3], na.float64()) + assert c_array.length == 3 + assert c_array.null_count == 1 + + view = na.c_array_view(c_array) + assert list(view.buffer(0).elements()) == [True, False, True] + [False] * 5 + assert list(view.buffer(1)) == [1.0, 0.0, 3.0] + + +def test_c_array_from_iterable_bool_with_nulls(): + c_array = na.c_array([True, None, False], na.bool()) + assert c_array.length == 3 + assert c_array.null_count == 1 + + view = na.c_array_view(c_array) + assert list(view.buffer(0).elements()) == [True, False, True] + [False] * 5 + assert list(view.buffer(1).elements()) == [True, False, False] + [False] * 5 + + +def test_c_array_from_iterable_fixed_size_binary_with_nulls(): + c_array = na.c_array([b"1234", None, b"5678"], na.fixed_size_binary(4)) + assert c_array.length == 3 + assert c_array.null_count == 1 + + view = na.c_array_view(c_array) + assert list(view.buffer(0).elements()) == [True, False, True] + [False] * 5 + assert 
list(view.buffer(1)) == [b"1234", b"\x00\x00\x00\x00", b"5678"] + + +def test_c_array_from_iterable_day_time_interval_with_nulls(): + c_array = na.c_array([(1, 2), None, (3, 4)], na.interval_day_time()) + assert c_array.length == 3 + assert c_array.null_count == 1 + + view = na.c_array_view(c_array) + assert list(view.buffer(0).elements()) == [True, False, True] + [False] * 5 + assert list(view.buffer(1)) == [(1, 2), (0, 0), (3, 4)] + + +def test_c_array_from_iterable_month_day_nano_interval_with_nulls(): + c_array = na.c_array([(1, 2, 3), None, (4, 5, 6)], na.interval_month_day_nano()) + assert c_array.length == 3 + assert c_array.null_count == 1 + + view = na.c_array_view(c_array) + assert list(view.buffer(0).elements()) == [True, False, True] + [False] * 5 + assert list(view.buffer(1)) == [(1, 2, 3), (0, 0, 0), (4, 5, 6)] + + def test_c_array_from_iterable_error(): with pytest.raises(ValueError, match="schema is required"): na.c_array([1, 2, 3]) diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py index 51944723..38bb0c62 100644 --- a/python/tests/test_c_buffer.py +++ b/python/tests/test_c_buffer.py @@ -214,7 +214,24 @@ def test_c_buffer_builder(): assert builder.size_bytes == 10 assert builder.capacity_bytes == 123 - assert bytes(builder.finish()) == b"abcdefghij" + with pytest.raises(IndexError): + builder.advance(-11) + + with pytest.raises(IndexError): + builder.advance(114) + + mv = memoryview(builder) + with pytest.raises(BufferError, match="CBufferBuilder is locked"): + memoryview(builder) + + with pytest.raises(BufferError, match="CBufferBuilder is locked"): + assert bytes(builder.finish()) == b"abcdefghij" + + mv[builder.size_bytes] = ord("k") + builder.advance(1) + + del mv + assert bytes(builder.finish()) == b"abcdefghijk" def test_c_buffer_from_iterable(): @@ -231,7 +248,7 @@ def test_c_buffer_from_iterable(): # An Arrow type whose storage type is not the same as its top-level # type will error. 
- with pytest.raises(ValueError, match="Can't create buffer from type"): + with pytest.raises(ValueError, match="Can't create buffer"): na.c_buffer([1, 2, 3], na.date32()) diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py index 8cbd66de..5b9c8103 100644 --- a/python/tests/test_iterator.py +++ b/python/tests/test_iterator.py @@ -50,14 +50,7 @@ def test_iterator_primitive(): def test_iterator_nullable_primitive(): - array = na.c_array_from_buffers( - na.int32(), - 4, - buffers=[ - na.c_buffer([1, 1, 1, 0], na.bool()), - na.c_buffer([1, 2, 3, 0], na.int32()), - ], - ) + array = na.c_array([1, 2, 3, None], na.int32()) assert list(iter_py(array)) == [1, 2, 3, None] sliced = array[1:]