(arrow-nanoarrow) branch main updated: feat(python): Add user-facing `Array` class (#396)

paleolimbot Thu, 21 Mar 2024 12:02:29 -0700

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 7af6dfff feat(python): Add user-facing `Array` class (#396)
7af6dfff is described below

commit 7af6dfff2a194a9b0b599a4fd7ed9d4f3ff5f650
Author: Dewey Dunnington <de...@dunnington.ca>
AuthorDate: Thu Mar 21 16:00:29 2024 -0300

    feat(python): Add user-facing `Array` class (#396)
    
    This PR implements the `nanoarrow.Array`, which is basically a
    `pyarrow.ChunkedArray`. This can represent a `Table`, `RecordBatch`,
    `ChunkedArray`, and `Array`. It doesn't quite play nicely with pyarrow's
    ChunkedArray (but will after the next release, since
    `__arrow_c_stream__` was just added there).
    
    The user-facing class is backed by a Cython class, the
    `CMaterializedArrayStream`, which manages some of the c-level details
    like resolving a chunk + offset when there is more than one chunk in the
    array. An early version of this PR implemented the
    `CMaterializedArrayStream` using C pointers (e.g., `ArrowArray*
    arrays`), but I decided that was too complex and went back to
    `List[CArray]`. I think this is also better for managing ownership
    (e.g., more unneeded `CArray` instances can be released by the garbage
    collector).
    
    The `Array` class as implemented here is device-aware, although until we
    have non-CPU support it's difficult to test this. The methods I added
    here are basically stubs just to demonstrate the intention.
    
    This PR also implements the `Scalar`, whose main purpose is testing and
    other non-performance sensitive things (like lazier reprs for very large
    items or interactive inspection). They may also be useful for working
    with arrays that contain elements with very long strings or large arrays
    (e.g., geometry).
    
    I also added some basic accessors like `buffer()`, `child()`, and some
    ways one might want to iterate over an `Array` to make the utility of
    this class more clear.
    
    Basic usage:
    
    ```python
    import nanoarrow as na
    
    na.Array(range(100), na.int64())
    ```
    
    ```
    nanoarrow.Array<int64>[100]
    0
    1
    2
    3
    4
    5
    6
    7
    8
    9
    ...and 90 more items
    ```
    
    More involved example reading from an IPC stream:
    
    ```python
    import nanoarrow as na
    from nanoarrow.ipc import Stream
    
    url = "https://github.com/apache/arrow-testing/raw/master/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_primitive.stream"
    
    with Stream.from_url(url) as inp:
        array = na.Array(inp)
    
    array.child(25)
    ```
    
    ```
    nanoarrow.Array<string>[37]
    'co矢2p矢m'
    'wÂ€acrd'
    'kjd1dlô'
    'pib矢d5w'
    '6nnpwôg'
    'ndj£h£4'
    'ôôf4aµg'
    'kwÂh£fr'
    '°g5dk€e'
    'r€cbmdn'
    ...and 27 more items
    ```
    
    ---------
    
    Co-authored-by: Joris Van den Bossche <jorisvandenboss...@gmail.com>
---
 python/src/nanoarrow/__init__.py    |   2 +
 python/src/nanoarrow/_ipc_lib.pyx   |  98 +++----
 python/src/nanoarrow/_lib.pyx       | 126 ++++++++-
 python/src/nanoarrow/_repr_utils.py |   9 +
 python/src/nanoarrow/array.py       | 498 ++++++++++++++++++++++++++++++++++++
 python/src/nanoarrow/ipc.py         |   4 +-
 python/src/nanoarrow/iterator.py    |  24 +-
 python/tests/test_array.py          | 282 ++++++++++++++++++++
 python/tests/test_c_array.py        |   2 +-
 python/tests/test_iterator.py       |  98 +++----
 src/nanoarrow/buffer_inline.h       |  22 ++
 src/nanoarrow/nanoarrow.h           |   8 +
 src/nanoarrow/utils_test.cc         |  12 +
 13 files changed, 1077 insertions(+), 108 deletions(-)

diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index 7ae8ec99..5f99dc22 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -73,6 +73,7 @@ from nanoarrow.schema import (
     decimal256,
     struct,
 )
+from nanoarrow.array import Array
 from nanoarrow._version import __version__  # noqa: F401
 
 # Helps Sphinx automatically populate an API reference section
@@ -123,4 +124,5 @@ __all__ = [
     "uint32",
     "uint64",
     "uint8",
+    "Array",
 ]
diff --git a/python/src/nanoarrow/_ipc_lib.pyx 
b/python/src/nanoarrow/_ipc_lib.pyx
index 615ee10c..2db8e003 100644
--- a/python/src/nanoarrow/_ipc_lib.pyx
+++ b/python/src/nanoarrow/_ipc_lib.pyx
@@ -50,22 +50,32 @@ cdef extern from "nanoarrow_ipc.h" nogil:
 
 
 cdef class PyInputStreamPrivate:
-    cdef object obj
-    cdef object obj_method
-    cdef void* addr
-    cdef Py_ssize_t size_bytes
-    cdef int close_stream
-
-    def __cinit__(self, obj, close_stream=False):
-        self.obj = obj
-        self.obj_method = obj.readinto
-        self.addr = NULL
-        self.size_bytes = 0
-        self.close_stream = close_stream
+    cdef object _obj
+    cdef bint _close_obj
+    cdef void* _addr
+    cdef Py_ssize_t _size_bytes
+
+    def __cinit__(self, obj, close_obj=False):
+        self._obj = obj
+        self._close_obj = close_obj
+        self._addr = NULL
+        self._size_bytes = 0
+
+    @property
+    def obj(self):
+        return self._obj
+
+    @property
+    def close_obj(self):
+        return self._close_obj
+
+    def set_buffer(self, uintptr_t addr, Py_ssize_t size_bytes):
+        self._addr = <void*>addr
+        self._size_bytes = size_bytes
 
     # Needed for at least some implementations of readinto()
     def __len__(self):
-        return self.size_bytes
+        return self._size_bytes
 
     # Implement the buffer protocol so that this object can be used as
     # the argument to xxx.readinto(). This ensures that no extra copies
@@ -75,7 +85,7 @@ cdef class PyInputStreamPrivate:
     # implementation before issuing each read call (two per message, with
     # an extra call for a RecordBatch message to get the actual buffer data).
     def __getbuffer__(self, Py_buffer* buffer, int flags):
-        PyBuffer_FillInfo(buffer, self, self.addr, self.size_bytes, 0, flags)
+        PyBuffer_FillInfo(buffer, self, self._addr, self._size_bytes, 0, flags)
 
     def __releasebuffer__(self, Py_buffer* buffer):
         pass
@@ -83,33 +93,35 @@ cdef class PyInputStreamPrivate:
 
 cdef ArrowErrorCode py_input_stream_read(ArrowIpcInputStream* stream, uint8_t* 
buf,
                                          int64_t buf_size_bytes, int64_t* 
size_read_out,
-                                         ArrowError* error) noexcept:
-    cdef PyInputStreamPrivate stream_private = <object>stream.private_data
-    stream_private.addr = buf
-    stream_private.size_bytes = buf_size_bytes
-
-    try:
-        size_read_out[0] = stream_private.obj_method(stream_private)
-        return NANOARROW_OK
-    except Exception as e:
-        cls = type(e).__name__.encode()
-        msg = str(e).encode()
-        snprintf(
-            error.message,
-            sizeof(error.message),
-            "%s: %s",
-            <const char*>cls,
-            <const char*>msg
-        )
-        return EIO
-
-
-cdef void py_input_stream_release(ArrowIpcInputStream* stream) noexcept:
-    cdef PyInputStreamPrivate stream_private = <object>stream.private_data
-    if stream_private.close_stream:
-        stream_private.obj.close()
-
-    Py_DECREF(stream_private)
+                                         ArrowError* error) noexcept nogil:
+
+    with gil:
+        stream_private = <object>stream.private_data
+        stream_private.set_buffer(<uintptr_t>buf, buf_size_bytes)
+
+        try:
+            size_read_out[0] = stream_private.obj.readinto(stream_private)
+            return NANOARROW_OK
+        except Exception as e:
+            cls = type(e).__name__.encode()
+            msg = str(e).encode()
+            snprintf(
+                error.message,
+                sizeof(error.message),
+                "%s: %s",
+                <const char*>cls,
+                <const char*>msg
+            )
+            return EIO
+
+cdef void py_input_stream_release(ArrowIpcInputStream* stream) noexcept nogil:
+    with gil:
+        stream_private = <object>stream.private_data
+        if stream_private.close_obj:
+            stream_private.obj.close()
+
+        Py_DECREF(stream_private)
+
     stream.private_data = NULL
     stream.release = NULL
 
@@ -136,9 +148,9 @@ cdef class CIpcInputStream:
             return False
 
     @staticmethod
-    def from_readable(obj, close_stream=False):
+    def from_readable(obj, close_obj=False):
         cdef CIpcInputStream stream = CIpcInputStream()
-        cdef PyInputStreamPrivate private_data = PyInputStreamPrivate(obj, 
close_stream)
+        cdef PyInputStreamPrivate private_data = PyInputStreamPrivate(obj, 
close_obj)
 
         stream._stream.private_data = <PyObject*>private_data
         Py_INCREF(private_data)
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 2877feca..a83e029c 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -34,6 +34,7 @@ generally have better autocomplete + documentation available 
to IDEs).
 from libc.stdint cimport uintptr_t, uint8_t, int64_t
 from libc.string cimport memcpy
 from libc.stdio cimport snprintf
+from libc.errno cimport ENOMEM
 from cpython.bytes cimport PyBytes_FromStringAndSize
 from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer, 
PyCapsule_IsValid
 from cpython cimport (
@@ -1066,10 +1067,11 @@ cdef class CArray:
         return out
 
     def __getitem__(self, k):
+        self._assert_valid()
+
         if not isinstance(k, slice):
             raise TypeError(
-                f"Can't slice CArray with object of type {type(k).__name__}"
-            )
+                f"Can't subset CArray with object of type {type(k).__name__}")
 
         if k.step is not None:
             raise ValueError("Can't slice CArray with step")
@@ -2198,6 +2200,126 @@ cdef class CArrayStream:
         return _repr_utils.array_stream_repr(self)
 
 
+cdef class CMaterializedArrayStream:
+    cdef CSchema _schema
+    cdef CBuffer _array_ends
+    cdef list _arrays
+    cdef int64_t _total_length
+
+    def __cinit__(self):
+        self._arrays = []
+        self._total_length = 0
+        self._schema = CSchema.allocate()
+        self._array_ends = CBuffer.empty()
+        cdef int code = ArrowBufferAppendInt64(self._array_ends._ptr, 0)
+        Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+
+    cdef _finalize(self):
+        self._array_ends._set_data_type(NANOARROW_TYPE_INT64)
+
+    @property
+    def schema(self):
+        return self._schema
+
+    def __getitem__(self, k):
+        cdef int64_t kint
+        cdef int array_i
+        cdef const int64_t* sorted_offsets = 
<int64_t*>self._array_ends._ptr.data
+
+        if isinstance(k, slice):
+            raise NotImplementedError("index with slice")
+
+        kint = k
+        if kint < 0:
+            kint += self._total_length
+        if kint < 0 or kint >= self._total_length:
+            raise IndexError(f"Index {kint} is out of range")
+
+        array_i = ArrowResolveChunk64(kint, sorted_offsets, 0, 
len(self._arrays))
+        kint -= sorted_offsets[array_i]
+        return self._arrays[array_i], kint
+
+    def __len__(self):
+        return self._array_ends[len(self._arrays)]
+
+    def __iter__(self):
+        for c_array in self._arrays:
+            for item_i in range(c_array.length):
+                yield c_array, item_i
+
+    def array(self, int64_t i):
+        return self._arrays[i]
+
+    @property
+    def n_arrays(self):
+        return len(self._arrays)
+
+    @property
+    def arrays(self):
+        return iter(self._arrays)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        # When an array stream from iterable is supported, that could be used 
here
+        # to avoid unnessary shallow copies.
+        stream = CArrayStream.from_array_list(self._arrays, self._schema, 
move=False)
+        return stream.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def child(self, int64_t i):
+        cdef CMaterializedArrayStream out = CMaterializedArrayStream()
+        cdef int code
+
+        out._schema = self._schema.child(i)
+        out._arrays = [chunk.child(i) for chunk in self._arrays]
+        for child_chunk in out._arrays:
+            out._total_length += child_chunk.length
+            code = ArrowBufferAppendInt64(out._array_ends._ptr, 
out._total_length)
+            Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+
+        out._finalize()
+        return out
+
+    @staticmethod
+    def from_c_array(CArray array):
+        array._assert_valid()
+
+        cdef CMaterializedArrayStream out = CMaterializedArrayStream()
+        out._schema = array._schema
+
+        if array._ptr.length == 0:
+            out._finalize()
+            return out
+
+        out._arrays.append(array)
+        out._total_length += array._ptr.length
+        cdef int code = ArrowBufferAppendInt64(out._array_ends._ptr, 
out._total_length)
+        Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+
+        out._finalize()
+        return out
+
+    @staticmethod
+    def from_c_array_stream(CArrayStream stream):
+        stream._assert_valid()
+        cdef CMaterializedArrayStream out = CMaterializedArrayStream()
+        cdef int code
+        cdef CArray array
+
+        with stream:
+            for array in stream:
+                if array._ptr.length == 0:
+                    continue
+
+                out._total_length += array._ptr.length
+                code = ArrowBufferAppendInt64(out._array_ends._ptr, 
out._total_length)
+                Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+                out._arrays.append(array)
+
+            out._schema = stream._get_cached_schema()
+
+        out._finalize()
+        return out
+
+
 cdef class CDeviceArray:
     cdef object _base
     cdef ArrowDeviceArray* _ptr
diff --git a/python/src/nanoarrow/_repr_utils.py 
b/python/src/nanoarrow/_repr_utils.py
index 26a274aa..99b11fde 100644
--- a/python/src/nanoarrow/_repr_utils.py
+++ b/python/src/nanoarrow/_repr_utils.py
@@ -26,6 +26,15 @@ def make_class_label(obj, module=None):
     return f"{module}.{obj.__class__.__name__}"
 
 
+def c_schema_to_string(obj, max_char_width=80):
+    max_char_width = max(max_char_width, 10)
+    c_schema_string = obj._to_string(recursive=True, max_chars=max_char_width 
+ 1)
+    if len(c_schema_string) > max_char_width:
+        return c_schema_string[: (max_char_width - 3)] + "..."
+    else:
+        return c_schema_string
+
+
 def schema_repr(schema, indent=0):
     indent_str = " " * indent
     class_label = make_class_label(schema, module="nanoarrow.c_lib")
diff --git a/python/src/nanoarrow/array.py b/python/src/nanoarrow/array.py
new file mode 100644
index 00000000..78756e15
--- /dev/null
+++ b/python/src/nanoarrow/array.py
@@ -0,0 +1,498 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from functools import cached_property
+from typing import Iterable, Tuple
+
+from nanoarrow._lib import (
+    CDEVICE_CPU,
+    CArray,
+    CBuffer,
+    CDevice,
+    CMaterializedArrayStream,
+)
+from nanoarrow.c_lib import c_array, c_array_stream, c_array_view
+from nanoarrow.iterator import iter_py, iter_tuples
+from nanoarrow.schema import Schema
+
+from nanoarrow import _repr_utils
+
+
+class Scalar:
+    """Generic wrapper around an :class:`Array` element
+
+    This class exists to provide a generic implementation of
+    array-like indexing for the :class:`Array`. These objects
+    can currently only be created by extracting an element from
+    an :class:`Array`.
+
+    Note that it is rarely efficient to iterate over Scalar objects:
+    use the iterators in :mod:`nanoarrow.iterator` to more effectively
+    iterate over an :class:`Array`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> array = na.Array([1, 2, 3], na.int32())
+    >>> array[0]
+    Scalar<int32> 1
+    >>> array[0].as_py()
+    1
+    >>> array[0].schema
+    Schema(INT32)
+    """
+
+    def __init__(self):
+        # Private constructor
+        self._c_array = None
+        self._offset = None
+        self._schema = None
+        self._device = None
+
+    @property
+    def device(self) -> CDevice:
+        return self._device
+
+    @property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this scalar"""
+        return self._schema
+
+    def as_py(self):
+        """Get the Python object representation of this scalar"""
+        return next(iter_py(self))
+
+    def to_string(self, width_hint=80) -> str:
+        c_schema_string = _repr_utils.c_schema_to_string(
+            self._c_array.schema, width_hint // 4
+        )
+
+        prefix = f"Scalar<{c_schema_string}> "
+        width_hint -= len(prefix)
+
+        py_repr = repr(self.as_py())
+        if len(py_repr) > width_hint:
+            py_repr = py_repr[: (width_hint - 3)] + "..."
+        return f"{prefix}{py_repr}"
+
+    def __repr__(self) -> str:
+        return self.to_string()
+
+    def __arrow_c_array__(self, requested_schema=None):
+        array = self._c_array[self._offset : (self._offset + 1)]
+        return array.__arrow_c_array__(requested_schema=requested_schema)
+
+
+class Array:
+    """High-level in-memory Array representation
+
+    The Array is nanoarrow's high-level in-memory array representation whose
+    scope maps to that of a fully-consumed ArrowArrayStream in the Arrow C Data
+    interface. See :func:`array` for class details.
+
+    The :class:`Array` class is nanoarrow's high-level in-memory array
+    representation, encompasing the role of PyArrow's ``Array``,
+    ``ChunkedArray``, ``RecordBatch``, and ``Table``. This scope maps
+    to that of a fully-consumed ``ArrowArrayStream`` as represented by
+    the Arrow C Stream interface.
+
+    Note that an :class:`Array` is not necessarily contiguous in memory (i.e.,
+    it may consist of zero or more ``ArrowArray``s).
+
+    Parameters
+    ----------
+    obj : array or array stream-like
+        An array-like or array stream-like object as sanitized by
+        :func:`c_array_stream`.
+    schema : schema-like, optional
+        An optional schema, passed to :func:`c_array_stream`.
+    device : CDevice, optional
+        The device associated with the buffers held by this Array.
+        Defaults to the CPU device.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> na.Array([1, 2, 3], na.int32())
+    nanoarrow.Array<int32>[3]
+    1
+    2
+    3
+    """
+
+    def __init__(self, obj, schema=None, device=None) -> None:
+        if device is None:
+            self._device = CDEVICE_CPU
+        elif isinstance(device, CDevice):
+            self._device = device
+        else:
+            raise TypeError("device must be CDevice")
+
+        if isinstance(obj, CMaterializedArrayStream) and schema is None:
+            self._data = obj
+            return
+
+        if isinstance(obj, Array) and schema is None:
+            self._data = obj._data
+            return
+
+        if isinstance(obj, CArray) and schema is None:
+            self._data = CMaterializedArrayStream.from_c_array(obj)
+            return
+
+        with c_array_stream(obj, schema=schema) as stream:
+            self._data = CMaterializedArrayStream.from_c_array_stream(stream)
+
+    def _assert_one_chunk(self, op):
+        if self._data.n_arrays != 1:
+            raise ValueError(f"Can't {op} with non-contiguous Array")
+
+    def _assert_cpu(self, op):
+        if self._device != CDEVICE_CPU:
+            raise ValueError(f"Can't {op} with Array on non-CPU device")
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        self._assert_cpu("export ArrowArrayStream")
+        return self._data.__arrow_c_stream__(requested_schema=requested_schema)
+
+    def __arrow_c_array__(self, requested_schema=None):
+        self._assert_cpu("export ArrowArray")
+
+        if self._data.n_arrays == 0:
+            return c_array([], schema=self._data.schema).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+        elif self._data.n_arrays == 1:
+            return self._data.array(0).__arrow_c_array__(
+                requested_schema=requested_schema
+            )
+
+        self._assert_one_chunk("export ArrowArray")
+
+    @property
+    def device(self) -> CDevice:
+        """Get the device on which the buffers for this array are allocated.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.device
+        <nanoarrow.device.CDevice>
+        - device_type: 1
+        - device_id: 0
+        """
+        return self._device
+
+    @cached_property
+    def schema(self) -> Schema:
+        """Get the schema (data type) of this Array"""
+        return Schema(self._data.schema)
+
+    @property
+    def n_buffers(self) -> int:
+        """Get the number of buffers in each chunk of this Array.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.n_buffers
+        2
+        """
+        return self.schema._c_schema_view.layout.n_buffers
+
+    def buffer(self, i: int) -> CBuffer:
+        """Access a single buffer of a contiguous array.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.buffer(1)
+        nanoarrow.c_lib.CBufferView(int32[12 b] 1 2 3)
+        """
+        return self.buffers[i]
+
+    @cached_property
+    def buffers(self) -> Tuple[CBuffer]:
+        """Access buffers of a contiguous array.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> for buffer in array.buffers:
+        ...     print(buffer)
+        nanoarrow.c_lib.CBufferView(bool[0 b] )
+        nanoarrow.c_lib.CBufferView(int32[12 b] 1 2 3)
+        """
+        view = c_array_view(self)
+        return tuple(view.buffers)
+
+    def iter_buffers(self) -> Iterable[Tuple[CBuffer]]:
+        """Iterate over buffers of each chunk in this Array.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> for data, validity in array.iter_buffers():
+        ...     print(data)
+        ...     print(validity)
+        nanoarrow.c_lib.CBufferView(bool[0 b] )
+        nanoarrow.c_lib.CBufferView(int32[12 b] 1 2 3)
+        """
+        # Could be more efficient using the iterator.ArrayViewIterator
+        for chunk in self.iter_chunks():
+            yield chunk.buffers
+
+    @property
+    def n_children(self) -> int:
+        """Get the number of children for an Array of this type.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> import pyarrow as pa
+        >>> batch = pa.record_batch(
+        ...     [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
+        ...     names=["col1", "col2"]
+        ... )
+        >>> array = na.Array(batch)
+        >>> array.n_children
+        2
+        """
+        return self._data.schema.n_children
+
+    def child(self, i: int):
+        """Borrow a child Array from its parent.
+
+        Parameters
+        ----------
+        i : int
+            The index of the child to return.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> import pyarrow as pa
+        >>> batch = pa.record_batch(
+        ...     [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
+        ...     names=["col1", "col2"]
+        ... )
+        >>> array = na.Array(batch)
+        >>> array.child(1)
+        nanoarrow.Array<string>[3]
+        'a'
+        'b'
+        'c'
+        """
+        return Array(self._data.child(i), device=self._device)
+
+    def iter_children(self) -> Iterable:
+        """Iterate over children of this Array
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> import pyarrow as pa
+        >>> batch = pa.record_batch(
+        ...     [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
+        ...     names=["col1", "col2"]
+        ... )
+        >>> array = na.Array(batch)
+        >>> for child in array.iter_children():
+        ...     print(child)
+        nanoarrow.Array<int64>[3]
+        1
+        2
+        3
+        nanoarrow.Array<string>[3]
+        'a'
+        'b'
+        'c'
+        """
+        for i in range(self.n_children):
+            yield self.child(i)
+
+    @property
+    def n_chunks(self) -> int:
+        """Get the number of chunks in the underlying representation of this 
Array.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.n_chunks
+        1
+        """
+        return self._data.n_arrays
+
+    def chunk(self, i: int):
+        """Extract a single contiguous Array from the underlying 
representation.
+
+        Parameters
+        ----------
+        i : int
+            The index of the chunk to extract.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.chunk(0)
+        nanoarrow.Array<int32>[3]
+        1
+        2
+        3
+        """
+        return Array(self._data.array(i), device=self._device)
+
+    def iter_chunks(self) -> Iterable:
+        """Iterate over Arrays in the underlying representation whose buffers 
are
+        contiguous in memory.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> for chunk in array.iter_chunks():
+        ...     print(chunk)
+        nanoarrow.Array<int32>[3]
+        1
+        2
+        3
+        """
+        for array in self._data.arrays:
+            yield Array(array, device=self._device)
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __getitem__(self, k) -> Scalar:
+        scalar = Scalar()
+        scalar._c_array, scalar._offset = self._data[k]
+        scalar._schema = self.schema
+        scalar._device = self._device
+        return scalar
+
+    def iter_scalar(self) -> Iterable[Scalar]:
+        """Iterate over items as Scalars
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> for item in array.iter_scalar():
+        ...     print(item)
+        Scalar<int32> 1
+        Scalar<int32> 2
+        Scalar<int32> 3
+        """
+        for carray, offset in self._data:
+            scalar = Scalar()
+            scalar._c_array = carray
+            scalar._offset = offset
+            scalar._schema = self.schema
+            scalar._device = self._device
+            yield scalar
+
+    def iter_py(self) -> Iterable:
+        """Iterate over the default Python representation of each element.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> for item in array.iter_py():
+        ...     print(item)
+        1
+        2
+        3
+        """
+        return iter_py(self)
+
+    def iter_tuples(self) -> Iterable[Tuple]:
+        """Iterate over rows of a struct array as tuples.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> import pyarrow as pa
+        >>> batch = pa.record_batch(
+        ...     [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
+        ...     names=["col1", "col2"]
+        ... )
+        >>> array = na.Array(batch)
+        >>> for item in array.iter_tuples():
+        ...     print(item)
+        (1, 'a')
+        (2, 'b')
+        (3, 'c')
+        """
+        return iter_tuples(self)
+
+    def __iter__(self):
+        raise NotImplementedError(
+            "Use iter_scalar(), iter_py(), or iter_tuples() "
+            "to iterate over elements of this Array"
+        )
+
+    def to_string(self, width_hint=80, items_hint=10) -> str:
+        cls_name = _repr_utils.make_class_label(self, module="nanoarrow")
+        len_text = f"[{len(self)}]"
+        c_schema_string = _repr_utils.c_schema_to_string(
+            self._data.schema, width_hint - len(cls_name) - len(len_text) - 2
+        )
+
+        lines = [f"{cls_name}<{c_schema_string}>{len_text}"]
+
+        for i, item in enumerate(self.iter_py()):
+            if i >= items_hint:
+                break
+            py_repr = repr(item)
+            if len(py_repr) > width_hint:
+                py_repr = py_repr[: (width_hint - 3)] + "..."
+            lines.append(py_repr)
+
+        n_more_items = len(self) - items_hint
+        if n_more_items > 1:
+            lines.append(f"...and {n_more_items} more items")
+        elif n_more_items > 0:
+            lines.append(f"...and {n_more_items} more item")
+
+        return "\n".join(lines)
+
+    def __repr__(self) -> str:
+        return self.to_string()
diff --git a/python/src/nanoarrow/ipc.py b/python/src/nanoarrow/ipc.py
index 82b719ba..645c3957 100644
--- a/python/src/nanoarrow/ipc.py
+++ b/python/src/nanoarrow/ipc.py
@@ -138,7 +138,7 @@ class Stream:
         """
         out = Stream()
         out._stream = CIpcInputStream.from_readable(
-            open(obj, "rb", *args, **kwargs), close_stream=True
+            open(obj, "rb", *args, **kwargs), close_obj=True
         )
         out._desc = repr(obj)
         return out
@@ -179,7 +179,7 @@ class Stream:
 
         out = Stream()
         out._stream = CIpcInputStream.from_readable(
-            urllib.request.urlopen(obj, *args, **kwargs), close_stream=True
+            urllib.request.urlopen(obj, *args, **kwargs), close_obj=True
         )
         out._desc = repr(obj)
         return out
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index 4b13bc76..ae86fe51 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -28,7 +28,7 @@ from nanoarrow.c_lib import (
 )
 
 
-def iterator(obj, schema=None) -> Iterable:
+def iter_py(obj, schema=None) -> Iterable:
     """Iterate over items in zero or more arrays
 
     Returns an iterator over an array stream where each item is a
@@ -48,18 +48,18 @@ def iterator(obj, schema=None) -> Iterable:
     >>> import nanoarrow as na
     >>> from nanoarrow import iterator
     >>> array = na.c_array([1, 2, 3], na.int32())
-    >>> list(iterator.iterator(array))
+    >>> list(iterator.iter_py(array))
     [1, 2, 3]
     """
-    return RowIterator.get_iterator(obj, schema=schema)
+    return PyIterator.get_iterator(obj, schema=schema)
 
 
-def itertuples(obj, schema=None) -> Iterable[Tuple]:
+def iter_tuples(obj, schema=None) -> Iterable[Tuple]:
     """Iterate over rows in zero or more struct arrays
 
     Returns an iterator over an array stream of struct arrays (i.e.,
     record batches) where each item is a tuple of the items in each
-    row. This is different than :func:`iterator`, which encodes struct
+    row. This is different than :func:`iter_py`, which encodes struct
     columns as dictionaries.
 
     Paramters
@@ -77,14 +77,14 @@ def itertuples(obj, schema=None) -> Iterable[Tuple]:
     >>> from nanoarrow import iterator
     >>> import pyarrow as pa
     >>> array = pa.record_batch([pa.array([1, 2, 3])], names=["col1"])
-    >>> list(iterator.itertuples(array))
+    >>> list(iterator.iter_tuples(array))
     [(1,), (2,), (3,)]
     """
     return RowTupleIterator.get_iterator(obj, schema=schema)
 
 
 class ArrayViewIterator:
-    """Base class for iterators that use an internal ArrowArrayStream
+    """Base class for iterators that use an internal ArrowArrayView
     as the basis for conversion to Python objects. Intended for internal use.
     """
 
@@ -127,7 +127,7 @@ class ArrayViewIterator:
         return self
 
 
-class RowIterator(ArrayViewIterator):
+class PyIterator(ArrayViewIterator):
     """Iterate over the Python object version of values in an ArrowArrayView.
     Intended for internal use.
     """
@@ -143,7 +143,9 @@ class RowIterator(ArrayViewIterator):
     def _iter1(self, offset, length):
         type_id = self._schema_view.type_id
         if type_id not in _ITEMS_ITER_LOOKUP:
-            raise KeyError(f"Can't resolve iterator for type '{self.schema_view.type}'")
+            raise KeyError(
+                f"Can't resolve iterator for type '{self._schema_view.type}'"
+            )
 
         factory = getattr(self, _ITEMS_ITER_LOOKUP[type_id])
         return factory(offset, length)
@@ -254,7 +256,7 @@ class RowIterator(ArrayViewIterator):
             return iter(items)
 
 
-class RowTupleIterator(RowIterator):
+class RowTupleIterator(PyIterator):
     """Iterate over rows of a struct array (stream) where each row is a
     tuple instead of a dictionary. This is ~3x faster and matches other
     Python concepts more closely (e.g., dbapi's cursor, pandas itertuples).
@@ -270,7 +272,7 @@ class RowTupleIterator(RowIterator):
             )
 
     def _make_child(self, schema, array_view):
-        return RowIterator(schema, _array_view=array_view)
+        return PyIterator(schema, _array_view=array_view)
 
     def _iter1(self, offset, length):
         return self._struct_tuple_iter(offset, length)
diff --git a/python/tests/test_array.py b/python/tests/test_array.py
new file mode 100644
index 00000000..fe590e60
--- /dev/null
+++ b/python/tests/test_array.py
@@ -0,0 +1,282 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+from nanoarrow.c_lib import CArrayStream
+
+import nanoarrow as na
+
+
+def test_array_construct():
+    array = na.Array([], na.int32())
+    assert array.schema.type == na.Type.INT32
+
+    array2 = na.Array(array)
+    assert array2._data is array._data
+
+    array2 = na.Array(array._data)
+    assert array2._data is array._data
+
+    with pytest.raises(TypeError, match="device must be CDevice"):
+        na.Array([], na.int32(), device=1234)
+
+    with pytest.raises(NotImplementedError):
+        iter(array)
+
+
+def test_array_empty():
+    array = na.Array([], na.int32())
+    assert array.schema.type == na.Type.INT32
+    assert len(array) == 0
+
+    assert array.n_buffers == 2
+    assert list(array.buffer(0)) == []
+    assert list(array.buffer(1)) == []
+    assert list(array.iter_buffers()) == []
+
+    assert array.n_children == 0
+
+    assert array.n_chunks == 0
+    assert list(array.iter_chunks()) == []
+    with pytest.raises(IndexError):
+        array.chunk(0)
+
+    assert list(array.iter_py()) == []
+    assert list(array.iter_scalar()) == []
+    with pytest.raises(IndexError):
+        array[0]
+
+    with na.c_array_stream(array) as stream:
+        arrays = list(stream)
+        assert len(arrays) == 0
+
+    c_array = na.c_array(array)
+    assert c_array.length == 0
+    assert c_array.schema.format == "i"
+
+
+def test_array_contiguous():
+    array = na.Array([1, 2, 3], na.int32())
+    assert array.schema.type == na.Type.INT32
+    assert len(array) == 3
+
+    assert array.n_buffers == 2
+
+    validity, data = array.buffers
+    assert list(validity) == []
+    assert list(data) == [1, 2, 3]
+    assert array.buffer(0) is validity
+    assert array.buffer(1) is data
+
+    chunk_buffers = list(array.iter_buffers())
+    assert len(chunk_buffers) == array.n_chunks
+    assert len(chunk_buffers[0]) == array.n_buffers
+    assert list(chunk_buffers[0][1]) == [1, 2, 3]
+
+    assert array.n_children == 0
+    assert list(array.iter_children()) == []
+
+    assert array.n_chunks == 1
+    assert len(list(array.iter_chunks())) == 1
+    assert len(array.chunk(0)) == 3
+
+    # Scalars by iterator
+    for py_item, item in zip([1, 2, 3], array.iter_scalar()):
+        assert item.as_py() == py_item
+
+    # Scalars by __getitem__
+    for py_item, i in zip([1, 2, 3], range(len(array))):
+        assert array[i].as_py() == py_item
+
+    # Python objects by iter_py()
+    for py_item, item in zip([1, 2, 3], array.iter_py()):
+        assert item == py_item
+
+    with na.c_array_stream(array) as stream:
+        arrays = list(stream)
+        assert len(arrays) == 1
+
+    c_array = na.c_array(array)
+    assert c_array.length == 3
+    assert c_array.schema.format == "i"
+
+
+def test_array_chunked():
+    src = [na.c_array([1, 2, 3], na.int32()), na.c_array([4, 5, 6], na.int32())]
+
+    array = na.Array(CArrayStream.from_array_list(src, na.c_schema(na.int32())))
+    assert array.schema.type == na.Type.INT32
+    assert len(array) == 6
+
+    assert array.n_buffers == 2
+    with pytest.raises(ValueError, match="Can't export ArrowArray"):
+        array.buffers
+
+    chunk_buffers = list(array.iter_buffers())
+    assert len(chunk_buffers) == array.n_chunks
+    assert len(chunk_buffers[0]) == array.n_buffers
+    assert list(chunk_buffers[0][1]) == [1, 2, 3]
+    assert list(chunk_buffers[1][1]) == [4, 5, 6]
+
+    assert array.n_children == 0
+    assert list(array.iter_children()) == []
+
+    assert array.n_children == 0
+    assert list(array.iter_children()) == []
+
+    assert array.n_chunks == 2
+    assert len(list(array.iter_chunks())) == 2
+    assert len(array.chunk(0)) == 3
+
+    for py_item, item in zip([1, 2, 3, 4, 5, 6], array.iter_scalar()):
+        assert item.as_py() == py_item
+
+    for py_item, i in zip([1, 2, 3, 4, 5, 6], range(len(array))):
+        assert array[i].as_py() == py_item
+
+    # Python objects by iter_py()
+    for py_item, item in zip([1, 2, 3], array.iter_py()):
+        assert item == py_item
+
+    with na.c_array_stream(array) as stream:
+        arrays = list(stream)
+        assert len(arrays) == 2
+
+    msg = "Can't export ArrowArray"
+    with pytest.raises(ValueError, match=msg):
+        na.c_array(array)
+
+
+def test_array_children():
+    c_array = na.c_array_from_buffers(
+        na.struct({f"col{i}": na.int32() for i in range(100)}),
+        length=1,
+        buffers=[None],
+        children=[na.c_array([123456], na.int32())] * 100,
+    )
+    src = [c_array, c_array]
+    array = na.Array(CArrayStream.from_array_list(src, c_array.schema))
+
+    assert array.n_children == 100
+    assert array.child(0).schema.type == na.Type.INT32
+    assert array.child(0).n_chunks == 2
+    assert list(array.child(0).iter_py()) == [123456, 123456]
+
+    children = list(array.iter_children())
+    assert len(children) == array.n_children
+
+    tuples = list(array.iter_tuples())
+    assert len(tuples) == 2
+    assert len(tuples[0]) == 100
+
+
+def test_scalar_to_array():
+    array = na.Array([123456, 7890], na.int32())
+    scalar = scalar = array[1]
+    assert scalar.schema is array.schema
+    assert scalar.device is array.device
+    as_array = na.c_array(scalar)
+    assert as_array.offset == 1
+    assert as_array.length == 1
+    assert as_array.buffers == na.c_array(array).buffers
+
+    with pytest.raises(NotImplementedError):
+        na.c_array(scalar, na.string())
+
+
+def test_scalar_repr():
+    # Check a scalar repr that does not need truncation
+    scalar = na.Array([123456], na.int32())[0]
+    assert repr(scalar) == "Scalar<int32> 123456"
+
+    # Check a long Scalar repr that needs truncation
+    c_array = na.c_array_from_buffers(
+        na.struct({f"col{i}": na.int32() for i in range(100)}),
+        length=1,
+        buffers=[None],
+        children=[na.c_array([123456], na.int32())] * 100,
+    )
+    scalar = na.Array(c_array)[0]
+    assert repr(scalar) == (
+        "Scalar<struct<col0: int3...> {'col0': 123456, "
+        "'col1': 123456, 'col2': 123456,..."
+    )
+    assert len(repr(scalar)) == 80
+
+
+def test_scalar_repr_long():
+    pa = pytest.importorskip("pyarrow")
+    scalar = na.Array(pa.array(["abcdefg" * 10]))[0]
+    assert repr(scalar).endswith("...")
+    assert len(repr(scalar)) == 80
+
+
+def test_array_repr():
+    array = na.Array(range(10), na.int32())
+    one_to_ten = "\n".join(str(i) for i in range(10))
+
+    assert repr(array) == f"nanoarrow.Array<int32>[10]\n{one_to_ten}"
+
+    array = na.Array(range(11), na.int32())
+    assert (
+        repr(array) == f"nanoarrow.Array<int32>[11]\n{one_to_ten}\n...and 1 more item"
+    )
+
+    array = na.Array(range(12), na.int32())
+    assert (
+        repr(array) == f"nanoarrow.Array<int32>[12]\n{one_to_ten}\n...and 2 more items"
+    )
+
+
+def test_wide_array_repr():
+    c_array = na.c_array_from_buffers(
+        na.struct({f"col{i}": na.int32() for i in range(100)}),
+        length=1,
+        buffers=[None],
+        children=[na.c_array([123456], na.int32())] * 100,
+    )
+    array = na.Array(c_array)
+
+    repr_lines = repr(array).splitlines()
+
+    # Check abbreviated schema
+    assert repr_lines[0] == (
+        "nanoarrow.Array<struct<col0: int32, col1: int32, col2"
+        ": int32, col3: int32...>[1]"
+    )
+    assert len(repr_lines[0]) == 80
+
+    # Check an abbreviated value
+    assert len(repr_lines[1]) == 80
+
+
+def test_array_repr_long():
+    pa = pytest.importorskip("pyarrow")
+
+    # Check that exact length is not truncated with a ...
+    array = na.Array(pa.array(["a" * 78]))
+    repr_lines = repr(array).splitlines()
+    assert len(repr_lines) == 2
+    assert not repr_lines[1].endswith("...")
+    assert len(repr_lines[1]) == 80
+
+    # Check that wide output is truncated with a ...
+    array = na.Array(pa.array(["a" * 79]))
+    repr_lines = repr(array).splitlines()
+    assert len(repr_lines) == 2
+    assert repr_lines[1].endswith("...")
+    assert len(repr_lines[1]) == 80
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index 046c2de2..75ab2aa7 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -134,7 +134,7 @@ def test_c_array_slice_errors():
     array = na.c_array([1, 2, 3], na.int32())
 
     with pytest.raises(TypeError):
-        array[0]
+        array[None]
     with pytest.raises(IndexError):
         array[4:]
     with pytest.raises(IndexError):
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index 95d0218a..abba0846 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -16,17 +16,17 @@
 # under the License.
 
 import pytest
-from nanoarrow.iterator import iterator, itertuples
+from nanoarrow.iterator import iter_py, iter_tuples
 
 import nanoarrow as na
 
 
 def test_iterator_primitive():
     array = na.c_array([1, 2, 3], na.int32())
-    assert list(iterator(array)) == [1, 2, 3]
+    assert list(iter_py(array)) == [1, 2, 3]
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [2, 3]
+    assert list(iter_py(sliced)) == [2, 3]
 
 
 def test_iterator_nullable_primitive():
@@ -38,10 +38,10 @@ def test_iterator_nullable_primitive():
             na.c_buffer([1, 2, 3, 0], na.int32()),
         ],
     )
-    assert list(iterator(array)) == [1, 2, 3, None]
+    assert list(iter_py(array)) == [1, 2, 3, None]
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [2, 3, None]
+    assert list(iter_py(sliced)) == [2, 3, None]
 
 
 def test_iterator_string():
@@ -49,10 +49,10 @@ def test_iterator_string():
         na.string(), 2, buffers=[None, na.c_buffer([0, 2, 5], na.int32()), b"abcde"]
     )
 
-    assert list(iterator(array)) == ["ab", "cde"]
+    assert list(iter_py(array)) == ["ab", "cde"]
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == ["cde"]
+    assert list(iter_py(sliced)) == ["cde"]
 
 
 def test_iterator_nullable_string():
@@ -66,10 +66,10 @@ def test_iterator_nullable_string():
         ],
     )
 
-    assert list(iterator(array)) == ["ab", "cde", None]
+    assert list(iter_py(array)) == ["ab", "cde", None]
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == ["cde", None]
+    assert list(iter_py(sliced)) == ["cde", None]
 
 
 def test_iterator_binary():
@@ -77,10 +77,10 @@ def test_iterator_binary():
         na.binary(), 2, buffers=[None, na.c_buffer([0, 2, 5], na.int32()), b"abcde"]
     )
 
-    assert list(iterator(array)) == [b"ab", b"cde"]
+    assert list(iter_py(array)) == [b"ab", b"cde"]
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [b"cde"]
+    assert list(iter_py(sliced)) == [b"cde"]
 
 
 def test_iterator_nullable_binary():
@@ -94,13 +94,13 @@ def test_iterator_nullable_binary():
         ],
     )
 
-    assert list(iterator(array)) == [b"ab", b"cde", None]
+    assert list(iter_py(array)) == [b"ab", b"cde", None]
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [b"cde", None]
+    assert list(iter_py(sliced)) == [b"cde", None]
 
 
-def test_itertuples():
+def test_iter_tuples():
     array = na.c_array_from_buffers(
         na.struct({"col1": na.int32(), "col2": na.bool()}),
         length=3,
@@ -108,10 +108,10 @@ def test_itertuples():
         children=[na.c_array([1, 2, 3], na.int32()), na.c_array([1, 0, 1], na.bool())],
     )
 
-    assert list(itertuples(array)) == [(1, True), (2, False), (3, True)]
+    assert list(iter_tuples(array)) == [(1, True), (2, False), (3, True)]
 
     sliced = array[1:]
-    assert list(itertuples(sliced)) == [(2, False), (3, True)]
+    assert list(iter_tuples(sliced)) == [(2, False), (3, True)]
 
     sliced_child = na.c_array_from_buffers(
         array.schema,
@@ -119,13 +119,13 @@ def test_itertuples():
         buffers=[None],
         children=[array.child(0)[1:], array.child(1)[1:]],
     )
-    assert list(itertuples(sliced_child)) == [(2, False), (3, True)]
+    assert list(iter_tuples(sliced_child)) == [(2, False), (3, True)]
 
     nested_sliced = sliced_child[1:]
-    assert list(itertuples(nested_sliced)) == [(3, True)]
+    assert list(iter_tuples(nested_sliced)) == [(3, True)]
 
 
-def test_itertuples_nullable():
+def test_iter_tuples_nullable():
     array = na.c_array_from_buffers(
         na.struct({"col1": na.int32(), "col2": na.bool()}),
         length=4,
@@ -136,10 +136,10 @@ def test_itertuples_nullable():
         ],
     )
 
-    assert list(itertuples(array)) == [(1, True), (2, False), (3, True), None]
+    assert list(iter_tuples(array)) == [(1, True), (2, False), (3, True), None]
 
     sliced = array[1:]
-    assert list(itertuples(sliced)) == [(2, False), (3, True), None]
+    assert list(iter_tuples(sliced)) == [(2, False), (3, True), None]
 
     sliced_child = na.c_array_from_buffers(
         array.schema,
@@ -147,15 +147,15 @@ def test_itertuples_nullable():
         buffers=[na.c_buffer([True, True, False], na.bool())],
         children=[array.child(0)[1:], array.child(1)[1:]],
     )
-    assert list(itertuples(sliced_child)) == [(2, False), (3, True), None]
+    assert list(iter_tuples(sliced_child)) == [(2, False), (3, True), None]
 
     nested_sliced = sliced_child[1:]
-    assert list(itertuples(nested_sliced)) == [(3, True), None]
+    assert list(iter_tuples(nested_sliced)) == [(3, True), None]
 
 
-def test_itertuples_errors():
+def test_iter_tuples_errors():
     with pytest.raises(TypeError, match="can only iterate over struct arrays"):
-        list(itertuples(na.c_array([1, 2, 3], na.int32())))
+        list(iter_tuples(na.c_array([1, 2, 3], na.int32())))
 
 
 def test_iterator_struct():
@@ -166,14 +166,14 @@ def test_iterator_struct():
         children=[na.c_array([1, 2, 3], na.int32()), na.c_array([1, 0, 1], na.bool())],
     )
 
-    assert list(iterator(array)) == [
+    assert list(iter_py(array)) == [
         {"col1": 1, "col2": True},
         {"col1": 2, "col2": False},
         {"col1": 3, "col2": True},
     ]
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [
+    assert list(iter_py(sliced)) == [
         {"col1": 2, "col2": False},
         {"col1": 3, "col2": True},
     ]
@@ -190,7 +190,7 @@ def test_iterator_nullable_struct():
         ],
     )
 
-    assert list(iterator(array)) == [
+    assert list(iter_py(array)) == [
         {"col1": 1, "col2": True},
         {"col1": 2, "col2": False},
         {"col1": 3, "col2": True},
@@ -198,7 +198,7 @@ def test_iterator_nullable_struct():
     ]
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [
+    assert list(iter_py(sliced)) == [
         {"col1": 2, "col2": False},
         {"col1": 3, "col2": True},
         None,
@@ -209,13 +209,13 @@ def test_iterator_list():
     pa = pytest.importorskip("pyarrow")
     items = [[1, 2, 3], [4, 5, 6], [7, 8, None], [0]]
     array = pa.array(items)
-    assert list(iterator(array)) == items
+    assert list(iter_py(array)) == items
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [[4, 5, 6], [7, 8, None], [0]]
+    assert list(iter_py(sliced)) == [[4, 5, 6], [7, 8, None], [0]]
 
     array_sliced_child = pa.ListArray.from_arrays([0, 2, 5, 8, 9], array.values[1:])
-    assert (list(iterator(array_sliced_child))) == [
+    assert (list(iter_py(array_sliced_child))) == [
         [2, 3],
         [4, 5, 6],
         [7, 8, None],
@@ -223,7 +223,7 @@ def test_iterator_list():
     ]
 
     nested_sliced = array_sliced_child[1:]
-    assert (list(iterator(nested_sliced))) == [
+    assert (list(iter_py(nested_sliced))) == [
         [4, 5, 6],
         [7, 8, None],
         [0],
@@ -234,17 +234,17 @@ def test_iterator_nullable_list():
     pa = pytest.importorskip("pyarrow")
     items = [[1, 2, 3], [4, 5, 6], [7, 8, None], [0], None]
     array = pa.array(items)
-    assert list(iterator(array)) == items
+    assert list(iter_py(array)) == items
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [[4, 5, 6], [7, 8, None], [0], None]
+    assert list(iter_py(sliced)) == [[4, 5, 6], [7, 8, None], [0], None]
 
     array_sliced_child = pa.ListArray.from_arrays(
         [0, 2, 5, 8, 9, 9],
         array.values[1:],
         mask=pa.array([False, False, False, False, True]),
     )
-    assert (list(iterator(array_sliced_child))) == [
+    assert (list(iter_py(array_sliced_child))) == [
         [2, 3],
         [4, 5, 6],
         [7, 8, None],
@@ -253,42 +253,42 @@ def test_iterator_nullable_list():
     ]
 
     nested_sliced = array_sliced_child[1:]
-    assert (list(iterator(nested_sliced))) == [[4, 5, 6], [7, 8, None], [0], None]
+    assert (list(iter_py(nested_sliced))) == [[4, 5, 6], [7, 8, None], [0], None]
 
 
 def test_iterator_fixed_size_list():
     pa = pytest.importorskip("pyarrow")
     items = [[1, 2, 3], [4, 5, 6], [7, 8, None]]
     array = pa.array(items, pa.list_(pa.int64(), 3))
-    assert list(iterator(array)) == items
+    assert list(iter_py(array)) == items
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [[4, 5, 6], [7, 8, None]]
+    assert list(iter_py(sliced)) == [[4, 5, 6], [7, 8, None]]
 
     array_sliced_child = pa.FixedSizeListArray.from_arrays(array.values[3:], 3)
-    assert (list(iterator(array_sliced_child))) == [[4, 5, 6], [7, 8, None]]
+    assert (list(iter_py(array_sliced_child))) == [[4, 5, 6], [7, 8, None]]
 
     nested_sliced = array_sliced_child[1:]
-    assert (list(iterator(nested_sliced))) == [[7, 8, None]]
+    assert (list(iter_py(nested_sliced))) == [[7, 8, None]]
 
 
 def test_iterator_nullable_fixed_size_list():
     pa = pytest.importorskip("pyarrow")
     items = [[1, 2, 3], [4, 5, 6], [7, 8, None], None]
     array = pa.array(items, pa.list_(pa.int64(), 3))
-    assert list(iterator(array)) == items
+    assert list(iter_py(array)) == items
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == [[4, 5, 6], [7, 8, None], None]
+    assert list(iter_py(sliced)) == [[4, 5, 6], [7, 8, None], None]
 
     # mask argument only available for pyarrow >= 15.0.0
     array_sliced_child = pa.FixedSizeListArray.from_arrays(
         array.values[3:], 3, mask=pa.array([False, False, True])
     )
-    assert (list(iterator(array_sliced_child))) == [[4, 5, 6], [7, 8, None], None]
+    assert (list(iter_py(array_sliced_child))) == [[4, 5, 6], [7, 8, None], None]
 
     nested_sliced = array_sliced_child[1:]
-    assert (list(iterator(nested_sliced))) == [[7, 8, None], None]
+    assert (list(iter_py(nested_sliced))) == [[7, 8, None], None]
 
 
 def test_iterator_dictionary():
@@ -297,10 +297,10 @@ def test_iterator_dictionary():
     items = ["ab", "cde", "ab", "def", "cde"]
     array = pa.array(items).dictionary_encode()
 
-    assert list(iterator(array)) == items
+    assert list(iter_py(array)) == items
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == ["cde", "ab", "def", "cde"]
+    assert list(iter_py(sliced)) == ["cde", "ab", "def", "cde"]
 
 
 def test_iterator_nullable_dictionary():
@@ -309,7 +309,7 @@ def test_iterator_nullable_dictionary():
     items = ["ab", "cde", "ab", "def", "cde", None]
     array = pa.array(items).dictionary_encode()
 
-    assert list(iterator(array)) == items
+    assert list(iter_py(array)) == items
 
     sliced = array[1:]
-    assert list(iterator(sliced)) == ["cde", "ab", "def", "cde", None]
+    assert list(iter_py(sliced)) == ["cde", "ab", "def", "cde", None]
diff --git a/src/nanoarrow/buffer_inline.h b/src/nanoarrow/buffer_inline.h
index b5fc627b..d6d74d82 100644
--- a/src/nanoarrow/buffer_inline.h
+++ b/src/nanoarrow/buffer_inline.h
@@ -28,6 +28,28 @@
 extern "C" {
 #endif
 
+// Modified from Arrow C++ (1eb46f76) cpp/src/arrow/chunk_resolver.h#L133-L162
+static inline int64_t ArrowResolveChunk64(int64_t index, const int64_t* offsets,
+                                          int64_t lo, int64_t hi) {
+  // Similar to std::upper_bound(), but slightly different as our offsets
+  // array always starts with 0.
+  int64_t n = hi - lo;
+  // First iteration does not need to check for n > 1
+  // (lo < hi is guaranteed by the precondition).
+  NANOARROW_DCHECK(n > 1);
+  do {
+    const int64_t m = n >> 1;
+    const int64_t mid = lo + m;
+    if (index >= offsets[mid]) {
+      lo = mid;
+      n -= m;
+    } else {
+      n = m;
+    }
+  } while (n > 1);
+  return lo;
+}
+
 static inline int64_t _ArrowGrowByFactor(int64_t current_capacity, int64_t new_capacity) {
   int64_t doubled_capacity = current_capacity * 2;
   if (doubled_capacity > new_capacity) {
diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h
index 990ee13f..ed07bfd7 100644
--- a/src/nanoarrow/nanoarrow.h
+++ b/src/nanoarrow/nanoarrow.h
@@ -289,6 +289,14 @@ ArrowErrorCode ArrowDecimalSetDigits(struct ArrowDecimal* decimal,
 ArrowErrorCode ArrowDecimalAppendDigitsToBuffer(const struct ArrowDecimal* decimal,
                                                 struct ArrowBuffer* buffer);
 
+/// \brief Resolve a chunk index from increasing int64_t offsets
+///
+/// Given a buffer of increasing int64_t offsets that begin with 0 (e.g., offset buffer
+/// of a large type, run ends of a chunked array implementation), resolve a value v
+/// where lo <= v < hi such that offsets[v] <= index < offsets[v + 1].
+static inline int64_t ArrowResolveChunk64(int64_t index, const int64_t* offsets,
+                                          int64_t lo, int64_t hi);
+
 /// @}
 
 /// \defgroup nanoarrow-schema Creating schemas
diff --git a/src/nanoarrow/utils_test.cc b/src/nanoarrow/utils_test.cc
index 8fca03b7..8b4fe922 100644
--- a/src/nanoarrow/utils_test.cc
+++ b/src/nanoarrow/utils_test.cc
@@ -538,3 +538,15 @@ TEST(DecimalTest, DecimalRoundtripBitshiftTest) {
 
   ArrowBufferReset(&buffer);
 }
+
+TEST(UtilsTest, ArrowResolveChunk64Test) {
+  int64_t offsets[] = {0, 2, 3, 6};
+  int64_t n_offsets = 4;
+
+  EXPECT_EQ(ArrowResolveChunk64(0, offsets, 0, n_offsets), 0);
+  EXPECT_EQ(ArrowResolveChunk64(1, offsets, 0, n_offsets), 0);
+  EXPECT_EQ(ArrowResolveChunk64(2, offsets, 0, n_offsets), 1);
+  EXPECT_EQ(ArrowResolveChunk64(3, offsets, 0, n_offsets), 2);
+  EXPECT_EQ(ArrowResolveChunk64(4, offsets, 0, n_offsets), 2);
+  EXPECT_EQ(ArrowResolveChunk64(5, offsets, 0, n_offsets), 2);
+}

(arrow-nanoarrow) branch main updated: feat(python): Add user-facing `Array` class (#396)

Reply via email to