(arrow-nanoarrow) branch main updated: feat(python): Add `Array.from_chunks()` constructor (#456)

paleolimbot Wed, 08 May 2024 13:11:47 -0700

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git



The following commit(s) were added to refs/heads/main by this push:
     new acbf6786 feat(python): Add `Array.from_chunks()` constructor (#456)
acbf6786 is described below

commit acbf67864e771915c344e449a0f1fddb0e0a1170
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed May 8 17:11:36 2024 -0300

    feat(python): Add `Array.from_chunks()` constructor (#456)
    
    This PR adds a public route to construct chunked arrays (and makes the
    other constructors safer to account for the fact that they are now
    user-facing). I use this quite a lot interactively to test that things
    work in the chunked case, and for nanoarrow to be useful in a "I can
    help you export things" kind of way, it needs to be able to do this
    (because string arrays with more than 2 GB of text or binary are not
    uncommon).
    
    The main safety consideration here is ensuring that all chunks have a
    schema of the same type, so I had to add a function to check for that
    (and ensure it was being checked).
    
    ```python
    import nanoarrow as na
    import numpy as np
    
    na.Array.from_chunks([[1, 2, 3], [4, 5, 6]], na.int32())
    na.Array.from_chunks((np.random.random(int(1e3)) for _ in range(int(1e3))))
    ```
---
 python/src/nanoarrow/_lib.pyx          | 167 ++++++++++++++++++++++++---------
 python/src/nanoarrow/array.py          |  56 ++++++++++-
 python/src/nanoarrow/c_array_stream.py |   2 +-
 python/src/nanoarrow/iterator.py       |   4 +-
 python/tests/test_array.py             |  45 ++++++++-
 python/tests/test_c_array_stream.py    |  28 +++---
 python/tests/test_c_schema.py          |  64 +++++++++++++
 python/tests/test_capsules.py          |   4 +-
 python/tests/test_device.py            |   6 +-
 python/tests/test_ipc.py               |   6 +-
 python/tests/test_nanoarrow.py         |   1 +
 11 files changed, 308 insertions(+), 75 deletions(-)

diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 690afa2b..b99a6505 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -769,6 +769,47 @@ cdef class CSchema:
     def __repr__(self):
         return _repr_utils.schema_repr(self)
 
+    def type_equals(self, CSchema other, check_nullability=False):
+        self._assert_valid()
+
+        if self._ptr == other._ptr:
+            return True
+
+        if self.format != other.format:
+            return False
+
+        # Nullability is not strictly part of the "type"; however, performing
+        # this check recursively is verbose to otherwise accomplish and
+        # sometimes this does matter.
+        cdef int64_t flags = self.flags
+        cdef int64_t other_flags = other.flags
+        if not check_nullability:
+            flags &= ~ARROW_FLAG_NULLABLE
+            other_flags &= ~ARROW_FLAG_NULLABLE
+
+        if flags != other_flags:
+            return False
+
+        if self.n_children != other.n_children:
+            return False
+
+        for child, other_child in zip(self.children, other.children):
+            if not child.type_equals(other_child, check_nullability=check_nullability):
+                return False
+
+        if (self.dictionary is None) != (other.dictionary is None):
+            return False
+
+        if self.dictionary is not None:
+            if not self.dictionary.type_equals(
+                other.dictionary,
+                check_nullability=check_nullability
+            ):
+                return False
+
+        return True
+
+
     @property
     def format(self):
         self._assert_valid()
@@ -874,6 +915,24 @@ cdef class CSchema:
 
         return builder.finish()
 
+# Raises TypeError for non-CSchema input and ValueError on a type mismatch.
+# It lives in _lib.pyx (not a testing module) so that checks that arrays
+# conform to a given or inferred schema can produce nice error messages.
+def assert_type_equal(actual, expected):
+    if not isinstance(actual, CSchema):
+        raise TypeError(f"actual is {type(actual).__name__}, not CSchema")
+
+    if not isinstance(expected, CSchema):
+        raise TypeError(f"expected is {type(expected).__name__}, not CSchema")
+
+    if not actual.type_equals(expected):
+        actual_label = actual._to_string(max_chars=80, recursive=True)
+        expected_label = expected._to_string(max_chars=80, recursive=True)
+        raise ValueError(
+            f"Expected schema\n  '{expected_label}'"
+            f"\nbut got\n  '{actual_label}'"
+        )
+
 
 cdef class CSchemaView:
     """Low-level ArrowSchemaView wrapper
@@ -1359,9 +1418,9 @@ cdef class CArray:
         cdef int64_t start = 0 if k.start is None else k.start
         cdef int64_t stop = self._ptr.length if k.stop is None else k.stop
         if start < 0:
-            start = self.length + start
+            start = self._ptr.length + start
         if stop < 0:
-            stop = self.length + stop
+            stop = self._ptr.length + stop
 
         if start > self._ptr.length or stop > self._ptr.length or stop < start:
             raise IndexError(
@@ -1448,11 +1507,14 @@ cdef class CArray:
     def device_id(self):
         return self._device_id
 
-    @property
-    def length(self):
+    def __len__(self):
         self._assert_valid()
         return self._ptr.length
 
+    @property
+    def length(self):
+        return len(self)
+
     @property
     def offset(self):
         self._assert_valid()
@@ -1557,9 +1619,12 @@ cdef class CArrayView:
     def layout(self):
         return CLayout(self, <uintptr_t>&self._ptr.layout)
 
+    def __len__(self):
+        return self._ptr.length
+
     @property
     def length(self):
-        return self._ptr.length
+        return len(self)
 
     @property
     def offset(self):
@@ -2700,20 +2765,33 @@ cdef class CArrayStream:
         return CArrayStream(base, <uintptr_t>c_array_stream_out)
 
     @staticmethod
-    def from_array_list(arrays, CSchema schema, move=False, validate=True):
+    def from_c_arrays(arrays, CSchema schema, move=False, validate=True):
         cdef ArrowArrayStream* c_array_stream_out
         base = alloc_c_array_stream(&c_array_stream_out)
 
-        if not move:
-            schema = schema.__deepcopy__()
-
-        cdef int code = ArrowBasicArrayStreamInit(c_array_stream_out, schema._ptr, len(arrays))
+        # Don't create more copies than we have to (but make sure
+        # one exists for validation if requested)
+        cdef CSchema out_schema = schema
+        if validate and not move:
+            validate_schema = schema
+            out_schema = schema.__deepcopy__()
+        elif validate:
+            validate_schema = schema.__deepcopy__()
+            out_schema = schema
+        elif not move:
+            out_schema = schema.__deepcopy__()
+
+        cdef int code = ArrowBasicArrayStreamInit(c_array_stream_out, out_schema._ptr, len(arrays))
         Error.raise_error_not_ok("ArrowBasicArrayStreamInit()", code)
 
         cdef ArrowArray tmp
         cdef CArray array
         for i in range(len(arrays)):
             array = arrays[i]
+
+            if validate:
+                assert_type_equal(array.schema, validate_schema)
+
             if not move:
                 c_array_shallow_copy(array._base, array._ptr, &tmp)
                 ArrowBasicArrayStreamSetArray(c_array_stream_out, i, &tmp)
@@ -2887,7 +2965,7 @@ cdef class CMaterializedArrayStream:
 
     def __iter__(self):
         for c_array in self._arrays:
-            for item_i in range(c_array.length):
+            for item_i in range(len(c_array)):
                 yield c_array, item_i
 
     def array(self, int64_t i):
@@ -2904,7 +2982,13 @@ cdef class CMaterializedArrayStream:
     def __arrow_c_stream__(self, requested_schema=None):
         # When an array stream from iterable is supported, that could be used here
         # to avoid unnessary shallow copies.
-        stream = CArrayStream.from_array_list(self._arrays, self._schema, move=False)
+        stream = CArrayStream.from_c_arrays(
+            self._arrays,
+            self._schema,
+            move=False,
+            validate=False
+        )
+
         return stream.__arrow_c_stream__(requested_schema=requested_schema)
 
     def child(self, int64_t i):
@@ -2914,7 +2998,7 @@ cdef class CMaterializedArrayStream:
         out._schema = self._schema.child(i)
         out._arrays = [chunk.child(i) for chunk in self._arrays]
         for child_chunk in out._arrays:
-            out._total_length += child_chunk.length
+            out._total_length += len(child_chunk)
             code = ArrowBufferAppendInt64(out._array_ends._ptr, out._total_length)
             Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
 
@@ -2922,45 +3006,44 @@ cdef class CMaterializedArrayStream:
         return out
 
     @staticmethod
-    def from_c_array(CArray array):
-        array._assert_valid()
-
+    def from_c_arrays(arrays, CSchema schema, bint validate=True):
         cdef CMaterializedArrayStream out = CMaterializedArrayStream()
-        out._schema = array._schema
 
-        if array._ptr.length == 0:
-            out._finalize()
-            return out
+        for array in arrays:
+            if not isinstance(array, CArray):
+                raise TypeError(f"Expected CArray but got {type(array).__name__}")
 
-        out._arrays.append(array)
-        out._total_length += array._ptr.length
-        cdef int code = ArrowBufferAppendInt64(out._array_ends._ptr, out._total_length)
-        Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+            if len(array) == 0:
+                continue
+
+            if validate:
+                assert_type_equal(array.schema, schema)
+
+            out._total_length += len(array)
+            code = ArrowBufferAppendInt64(out._array_ends._ptr, out._total_length)
+            Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
+            out._arrays.append(array)
 
+        out._schema = schema
         out._finalize()
         return out
 
     @staticmethod
-    def from_c_array_stream(CArrayStream stream):
-        stream._assert_valid()
-        cdef CMaterializedArrayStream out = CMaterializedArrayStream()
-        cdef int code
-        cdef CArray array
+    def from_c_array(CArray array):
+        return CMaterializedArrayStream.from_c_arrays(
+            [array],
+            array.schema,
+            validate=False
+        )
 
+    @staticmethod
+    def from_c_array_stream(CArrayStream stream):
         with stream:
-            for array in stream:
-                if array._ptr.length == 0:
-                    continue
-
-                out._total_length += array._ptr.length
-                code = ArrowBufferAppendInt64(out._array_ends._ptr, out._total_length)
-                Error.raise_error_not_ok("ArrowBufferAppendInt64()", code)
-                out._arrays.append(array)
-
-            out._schema = stream._get_cached_schema()
-
-        out._finalize()
-        return out
+            return CMaterializedArrayStream.from_c_arrays(
+                stream,
+                stream._get_cached_schema(),
+                validate=False
+            )
 
 
 cdef class CDeviceArray:
diff --git a/python/src/nanoarrow/array.py b/python/src/nanoarrow/array.py
index 35d58c40..eabe8f2a 100644
--- a/python/src/nanoarrow/array.py
+++ b/python/src/nanoarrow/array.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import itertools
 from functools import cached_property
 from typing import Iterable, Tuple
 
@@ -28,6 +29,7 @@ from nanoarrow._lib import (
 )
 from nanoarrow.c_array import c_array, c_array_view
 from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.c_schema import c_schema
 from nanoarrow.iterator import iter_array_views, iter_py, iter_tuples
 from nanoarrow.schema import Schema
 
@@ -161,6 +163,56 @@ class Array:
         with c_array_stream(obj, schema=schema) as stream:
             self._data = CMaterializedArrayStream.from_c_array_stream(stream)
 
+    @staticmethod
+    def from_chunks(obj: Iterable, schema=None, validate: bool = True):
+        """Create an Array with explicit chunks
+
+        Creates an :class:`Array` with explicit chunking from an iterable of
+        objects that can be converted to a :func:`c_array`.
+
+        Parameters
+        ----------
+        obj : iterable of array-like
+            An iterable of objects that can be passed to :func:`c_array`.
+        schema : schema-like, optional
+            An optional schema. If present, will be passed to :func:`c_array`
+            for each item in obj; if not present it will be inferred from the first
+            chunk.
+        validate : bool
+            Use ``False`` to opt out of validation steps performed when constructing
+            this array.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> na.Array.from_chunks([[1, 2, 3], [4, 5, 6]], na.int32())
+        nanoarrow.Array<int32>[6]
+        1
+        2
+        3
+        4
+        5
+        6
+        """
+        obj = iter(obj)
+
+        if schema is None:
+            first = next(obj, None)
+            if first is None:
+                raise ValueError("Can't create empty Array from chunks without schema")
+
+            first = c_array(first)
+            out_schema = first.schema
+            obj = itertools.chain([first], obj)
+        else:
+            out_schema = c_schema(schema)
+
+        data = CMaterializedArrayStream.from_c_arrays(
+            (c_array(item, schema) for item in obj), out_schema, validate=validate
+        )
+
+        return Array(data)
+
     def _assert_one_chunk(self, op):
         if self._data.n_arrays != 1:
             raise ValueError(f"Can't {op} with non-contiguous Array")
@@ -278,9 +330,9 @@ class Array:
         >>> import nanoarrow as na
         >>> array = na.Array([1, 2, 3], na.int32())
         >>> for view in array.iter_chunk_views():
-        ...     offset, length = view.offset, view.length
+        ...     offset, length = view.offset, len(view)
         ...     validity, data = view.buffers
-        ...     print(view.offset, view.length)
+        ...     print(offset, length)
         ...     print(validity)
         ...     print(data)
         0 3
diff --git a/python/src/nanoarrow/c_array_stream.py b/python/src/nanoarrow/c_array_stream.py
index 816f2696..77eeaaf6 100644
--- a/python/src/nanoarrow/c_array_stream.py
+++ b/python/src/nanoarrow/c_array_stream.py
@@ -88,7 +88,7 @@ def c_array_stream(obj=None, schema=None) -> CArrayStream:
 
     try:
         array = c_array(obj, schema=schema)
-        return CArrayStream.from_array_list([array], array.schema, validate=False)
+        return CArrayStream.from_c_arrays([array], array.schema, validate=False)
     except Exception as e:
         raise TypeError(
             f"An error occurred whilst converting {type(obj).__name__} "
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index aef56263..76f2a775 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -140,7 +140,7 @@ class ArrayViewBaseIterator:
             iterator = cls(stream._get_cached_schema())
             for array in stream:
                 iterator._set_array(array)
-                yield from iterator._iter_chunk(0, array.length)
+                yield from iterator._iter_chunk(0, len(array))
 
     def __init__(self, schema, *, _array_view=None):
         self._schema = c_schema(schema)
@@ -222,7 +222,7 @@ class PyIterator(ArrayViewBaseIterator):
 
     def _dictionary_iter(self, offset, length):
         dictionary = list(
-            self._dictionary._iter_chunk(0, self._dictionary._array_view.length)
+            self._dictionary._iter_chunk(0, len(self._dictionary._array_view))
         )
         for dict_index in self._primitive_iter(offset, length):
             yield None if dict_index is None else dictionary[dict_index]
diff --git a/python/tests/test_array.py b/python/tests/test_array.py
index 5ef75a69..a6a79eec 100644
--- a/python/tests/test_array.py
+++ b/python/tests/test_array.py
@@ -43,6 +43,41 @@ def test_array_alias_constructor():
     assert array.schema.type == na.Type.INT32
 
 
+def test_array_from_chunks():
+    # Check with explicit schema
+    array = na.Array.from_chunks([[1, 2, 3], [4, 5, 6]], na.int32())
+    assert array.schema.type == na.Type.INT32
+    assert array.n_chunks == 2
+    assert list(array.iter_py()) == [1, 2, 3, 4, 5, 6]
+
+    # Check with schema inferred from first chunk
+    array = na.Array.from_chunks(array.iter_chunks())
+    assert array.schema.type == na.Type.INT32
+    assert array.n_chunks == 2
+    assert list(array.iter_py()) == [1, 2, 3, 4, 5, 6]
+
+    # Check empty
+    array = na.Array.from_chunks([], na.int32())
+    assert array.schema.type == na.Type.INT32
+    assert len(array) == 0
+    assert array.n_chunks == 0
+
+    msg = "Can't create empty Array from chunks without schema"
+    with pytest.raises(ValueError, match=msg):
+        na.Array.from_chunks([])
+
+
+def test_array_from_chunks_validate():
+    chunks = [na.c_array([1, 2, 3], na.uint32()), na.c_array([1, 2, 3], na.int32())]
+    # Check that we get validation by default
+    with pytest.raises(ValueError, match="Expected schema"):
+        na.Array.from_chunks(chunks)
+
+    # ...but that one can opt out
+    array = na.Array.from_chunks(chunks, validate=False)
+    assert list(array.iter_py()) == [1, 2, 3, 1, 2, 3]
+
+
 def test_array_empty():
     array = na.Array([], na.int32())
     assert array.schema.type == na.Type.INT32
@@ -71,7 +106,7 @@ def test_array_empty():
         assert len(arrays) == 0
 
     c_array = na.c_array(array)
-    assert c_array.length == 0
+    assert len(c_array) == 0
     assert c_array.schema.format == "i"
 
 
@@ -118,14 +153,14 @@ def test_array_contiguous():
         assert len(arrays) == 1
 
     c_array = na.c_array(array)
-    assert c_array.length == 3
+    assert len(c_array) == 3
     assert c_array.schema.format == "i"
 
 
 def test_array_chunked():
     src = [na.c_array([1, 2, 3], na.int32()), na.c_array([4, 5, 6], na.int32())]
 
-    array = na.Array(CArrayStream.from_array_list(src, na.c_schema(na.int32())))
+    array = na.Array(CArrayStream.from_c_arrays(src, na.c_schema(na.int32())))
     assert array.schema.type == na.Type.INT32
     assert len(array) == 6
 
@@ -176,7 +211,7 @@ def test_array_children():
         children=[na.c_array([123456], na.int32())] * 100,
     )
     src = [c_array, c_array]
-    array = na.Array(CArrayStream.from_array_list(src, c_array.schema))
+    array = na.Array(CArrayStream.from_c_arrays(src, c_array.schema))
 
     assert array.n_children == 100
     assert array.child(0).schema.type == na.Type.INT32
@@ -198,7 +233,7 @@ def test_scalar_to_array():
     assert scalar.device is array.device
     as_array = na.c_array(scalar)
     assert as_array.offset == 1
-    assert as_array.length == 1
+    assert len(as_array) == 1
     assert as_array.buffers == na.c_array(array).buffers
 
     with pytest.raises(NotImplementedError):
diff --git a/python/tests/test_c_array_stream.py b/python/tests/test_c_array_stream.py
index 0fe38f4a..a788302c 100644
--- a/python/tests/test_c_array_stream.py
+++ b/python/tests/test_c_array_stream.py
@@ -16,7 +16,6 @@
 # under the License.
 
 import pytest
-from nanoarrow._lib import NanoarrowException
 from nanoarrow.c_array_stream import CArrayStream
 
 import nanoarrow as na
@@ -24,12 +23,12 @@ import nanoarrow as na
 
 def test_c_array_stream_from_c_array_stream():
     # Wrapping an existing stream is a no-op
-    array_stream = CArrayStream.from_array_list([], na.c_schema(na.int32()))
+    array_stream = CArrayStream.from_c_arrays([], na.c_schema(na.int32()))
     stream_from_stream = na.c_array_stream(array_stream)
     assert stream_from_stream is array_stream
 
     # With requested_schema should go through capsule
-    array_stream = CArrayStream.from_array_list([], na.c_schema(na.int32()))
+    array_stream = CArrayStream.from_c_arrays([], na.c_schema(na.int32()))
     with pytest.raises(NotImplementedError):
         na.c_array_stream(array_stream, na.int64())
 
@@ -43,7 +42,7 @@ def test_c_array_stream_from_capsule_protocol():
         def __arrow_c_stream__(self, *args, **kwargs):
             return self.obj.__arrow_c_stream__(*args, **kwargs)
 
-    array_stream = CArrayStream.from_array_list([], na.c_schema(na.int32()))
+    array_stream = CArrayStream.from_c_arrays([], na.c_schema(na.int32()))
     array_stream_wrapper = CArrayStreamWrapper(array_stream)
     from_protocol = na.c_array_stream(array_stream_wrapper)
     assert array_stream.is_valid() is False
@@ -70,14 +69,14 @@ def test_c_array_stream_from_old_pyarrow():
 
 
 def test_c_array_stream_from_bare_capsule():
-    array_stream = CArrayStream.from_array_list([], na.c_schema(na.int32()))
+    array_stream = CArrayStream.from_c_arrays([], na.c_schema(na.int32()))
 
     # Check from bare capsule without supplying a schema
     capsule = array_stream.__arrow_c_stream__()
     from_capsule = na.c_array_stream(capsule)
     assert from_capsule.get_schema().format == "i"
 
-    array_stream = CArrayStream.from_array_list([], na.c_schema(na.int32()))
+    array_stream = CArrayStream.from_c_arrays([], na.c_schema(na.int32()))
     capsule = array_stream.__arrow_c_stream__()
 
     with pytest.raises(TypeError, match="Can't import c_array_stream"):
@@ -109,30 +108,30 @@ def test_c_array_stream_error():
 def test_array_stream_from_arrays_schema():
     schema_in = na.c_schema(na.int32())
 
-    stream = CArrayStream.from_array_list([], schema_in)
+    stream = CArrayStream.from_c_arrays([], schema_in)
     assert schema_in.is_valid()
     assert list(stream) == []
     assert stream.get_schema().format == "i"
 
     # Check move of schema
-    CArrayStream.from_array_list([], schema_in, move=True)
+    CArrayStream.from_c_arrays([], schema_in, move=True)
     assert schema_in.is_valid() is False
     assert stream.get_schema().format == "i"
 
 
 def test_array_stream_from_arrays():
     schema_in = na.c_schema(na.int32())
-    array_in = na.c_array([1, 2, 3], schema_in)
+    array_in = na.c_array([1, 2, 3], na.int32())
     array_in_buffers = array_in.buffers
 
-    stream = CArrayStream.from_array_list([array_in], schema_in)
+    stream = CArrayStream.from_c_arrays([array_in], schema_in)
     assert array_in.is_valid()
     arrays = list(stream)
     assert len(arrays) == 1
     assert arrays[0].buffers == array_in_buffers
 
     # Check move of array
-    stream = CArrayStream.from_array_list([array_in], schema_in, move=True)
+    stream = CArrayStream.from_c_arrays([array_in], schema_in, move=True)
     assert array_in.is_valid() is False
     arrays = list(stream)
     assert len(arrays) == 1
@@ -144,12 +143,11 @@ def test_array_stream_from_arrays_validate():
     array_in = na.c_array([1, 2, 3], na.int32())
 
     # Check that we can skip validation and proceed without error
-    stream = CArrayStream.from_array_list([array_in], schema_in, validate=False)
+    stream = CArrayStream.from_c_arrays([array_in], schema_in, validate=False)
     arrays = list(stream)
     assert len(arrays) == 1
     assert arrays[0].n_buffers == 2
 
     # ...but that validation does happen by default
-    msg = "Expected array with 0 buffer"
-    with pytest.raises(NanoarrowException, match=msg):
-        CArrayStream.from_array_list([array_in], schema_in)
+    with pytest.raises(ValueError, match="Expected schema"):
+        CArrayStream.from_c_arrays([array_in], schema_in)
diff --git a/python/tests/test_c_schema.py b/python/tests/test_c_schema.py
index 74c69bd7..f70f49ab 100644
--- a/python/tests/test_c_schema.py
+++ b/python/tests/test_c_schema.py
@@ -127,6 +127,70 @@ def test_c_schema_metadata():
     assert view.extension_metadata == b"some_metadata"
 
 
+def test_c_schema_equals():
+    int32 = na.c_schema(na.int32())
+    struct = na.c_schema(na.struct({"col1": na.int32()}))
+    dictionary = na.c_schema(na.dictionary(na.int32(), na.string()))
+    ordered_dictionary = na.c_schema(
+        na.dictionary(na.int32(), na.string(), dictionary_ordered=True)
+    )
+
+    # Check schemas pointing to the same ArrowSchema
+    assert int32.type_equals(int32)
+
+    # Check equality with deep copies
+    assert int32.type_equals(int32.__deepcopy__())
+    assert struct.type_equals(struct.__deepcopy__())
+    assert dictionary.type_equals(dictionary.__deepcopy__())
+
+    # Check inequality because of format
+    assert int32.type_equals(struct) is False
+
+    # Check inequality because of nullability
+    assert int32.type_equals(int32.modify(flags=0), check_nullability=True) is False
+    # ...but not by default
+    assert int32.type_equals(int32.modify(flags=0)) is True
+
+    # Check inequality of type information encoded in flags
+    assert dictionary.type_equals(ordered_dictionary) is False
+
+    # Check inequality because of number of children
+    assert struct.type_equals(struct.modify(children=[])) is False
+
+    # Check inequality because of a difference in the children
+    assert struct.type_equals(struct.modify(children=[dictionary])) is False
+
+    # Check inequality because of dictionary presence
+    assert int32.type_equals(dictionary) is False
+    assert dictionary.type_equals(int32) is False
+
+    # Check inequality because of dictionary index type
+    assert (
+        dictionary.type_equals(na.c_schema(na.dictionary(na.int64(), na.string())))
+        is False
+    )
+
+    # Check inequality because of dictionary value type
+    assert dictionary.type_equals(dictionary.modify(dictionary=struct)) is False
+
+
+def test_c_schema_assert_type_equal():
+    from nanoarrow._lib import assert_type_equal
+
+    int32 = na.c_schema(na.int32())
+    string = na.c_schema(na.string())
+
+    with pytest.raises(TypeError):
+        assert_type_equal(None, int32)
+
+    with pytest.raises(TypeError):
+        assert_type_equal(int32, None)
+
+    msg = "Expected schema\n  'string'\nbut got\n  'int32'"
+    with pytest.raises(ValueError, match=msg):
+        assert_type_equal(int32, string)
+
+
 def test_c_schema_modify():
     schema = na.c_schema(na.null())
 
diff --git a/python/tests/test_capsules.py b/python/tests/test_capsules.py
index 2cf7fbf4..aa5b1743 100644
--- a/python/tests/test_capsules.py
+++ b/python/tests/test_capsules.py
@@ -74,7 +74,7 @@ def test_array():
         array = na.c_array(arr_obj)
         # some basic validation
         assert array.is_valid()
-        assert array.length == 3
+        assert len(array) == 3
         assert array.schema._to_string(recursive=True) == "int32"
 
         # roundtrip
@@ -98,7 +98,7 @@ def test_array_stream():
         # some basic validation
         assert array_stream.is_valid()
         array = array_stream.get_next()
-        assert array.length == 3
+        assert len(array) == 3
         assert (
             array_stream.get_schema()._to_string(recursive=True)
             == "struct<some_column: int32>"
diff --git a/python/tests/test_device.py b/python/tests/test_device.py
index 1158337a..09d897a5 100644
--- a/python/tests/test_device.py
+++ b/python/tests/test_device.py
@@ -43,12 +43,12 @@ def test_c_device_array():
 
     assert darray.schema.format == "i"
 
-    assert darray.array.length == 3
+    assert len(darray.array) == 3
     assert darray.array.device_type == device.cpu().device_type
     assert darray.array.device_id == device.cpu().device_id
 
     darray_view = darray.view()
-    assert darray_view.length == 3
+    assert len(darray_view) == 3
     assert list(darray_view.buffer(1)) == [1, 2, 3]
 
     # A CDeviceArray should be returned as is
@@ -75,7 +75,7 @@ def test_c_device_array_protocol():
 
     darray2 = device.c_device_array(wrapper)
     assert darray2.schema.format == "i"
-    assert darray2.array.length == 3
+    assert len(darray2.array) == 3
     assert darray2.array.buffers == darray.array.buffers
 
     with pytest.raises(NotImplementedError):
diff --git a/python/tests/test_ipc.py b/python/tests/test_ipc.py
index 6d281020..b9d15321 100644
--- a/python/tests/test_ipc.py
+++ b/python/tests/test_ipc.py
@@ -62,7 +62,7 @@ def test_ipc_stream_from_readable():
             with na.c_array_stream(input) as stream:
                 batches = list(stream)
                 assert len(batches) == 1
-                assert batches[0].length == 3
+                assert len(batches[0]) == 3
 
 
 def test_ipc_stream_from_path():
@@ -76,7 +76,7 @@ def test_ipc_stream_from_path():
             with na.c_array_stream(input) as stream:
                 batches = list(stream)
                 assert len(batches) == 1
-                assert batches[0].length == 3
+                assert len(batches[0]) == 3
 
 
 def test_ipc_stream_from_url():
@@ -90,7 +90,7 @@ def test_ipc_stream_from_url():
             with na.c_array_stream(input) as stream:
                 batches = list(stream)
                 assert len(batches) == 1
-                assert batches[0].length == 3
+                assert len(batches[0]) == 3
 
 
 def test_ipc_stream_python_exception_on_read():
diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py
index e138ecba..eccd2378 100644
--- a/python/tests/test_nanoarrow.py
+++ b/python/tests/test_nanoarrow.py
@@ -89,6 +89,7 @@ def test_c_array():
     array = na.c_array(pa.array([1, 2, 3], pa.int32()))
     assert array.is_valid() is True
     assert array.length == 3
+    assert len(array) == 3
     assert array.offset == 0
     assert array.null_count == 0
     assert array.n_buffers == 2

(arrow-nanoarrow) branch main updated: feat(python): Add `Array.from_chunks()` constructor (#456)

Reply via email to