This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 28e81237 refactor(python): Reorganize strategies for building arrays (#444)
28e81237 is described below
commit 28e81237ebe491efac416f85b238c89c0bdaa97b
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Apr 30 16:10:38 2024 -0300
refactor(python): Reorganize strategies for building arrays (#444)
This PR reorganizes the implementation for `na.c_array(<something>)`.
The motivation here is that there are some Arrow types that aren't
supported when creating Arrow arrays (e.g., decimal), and some container
types that are not supported (e.g., data frame protocol). I don't think
those should be supported right now but I would like there to be an
obvious route to adding support. The `ArrayBuilder` class and the
dispatch code that chooses the appropriate class/method is probably not
perfect but does provide a more reasonable path to adding support for a
new type (add a method to the `ArrayFromIterableBuilder` class) or new
container (add a new `ArrayBuilder` subclass).
In order to avoid a circular import, this would all have had to be added
to `c_lib.py`. Instead of doing a lazy import, I separated `c_lib.py`
into `c_schema.py`, `c_buffer.py`, `c_array.py`, and
`c_array_stream.py`. The tests for these functions had already been
separated and I quite like the organization (reasonably obvious which
file you might find the `c_array()` function in, for example).
---
python/src/nanoarrow/__init__.py | 17 +-
python/src/nanoarrow/_lib.pyx | 67 +++-
python/src/nanoarrow/array.py | 3 +-
python/src/nanoarrow/array_stream.py | 2 +-
python/src/nanoarrow/c_array.py | 579 +++++++++++++++++++++++++++
python/src/nanoarrow/c_array_stream.py | 124 ++++++
python/src/nanoarrow/c_buffer.py | 124 ++++++
python/src/nanoarrow/c_lib.py | 710 ---------------------------------
python/src/nanoarrow/c_schema.py | 123 ++++++
python/src/nanoarrow/device.py | 3 +-
python/src/nanoarrow/iterator.py | 3 +-
python/src/nanoarrow/schema.py | 2 +-
python/tests/test_array.py | 2 +-
python/tests/test_c_array.py | 39 +-
python/tests/test_c_array_stream.py | 4 +-
python/tests/test_c_buffer.py | 4 +-
python/tests/test_c_buffer_view.py | 5 +-
python/tests/test_c_schema_view.py | 2 +-
python/tests/test_capsules.py | 16 +-
python/tests/test_nanoarrow.py | 12 +-
20 files changed, 1076 insertions(+), 765 deletions(-)
diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index c73f60e9..c1cd12dd 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -25,18 +25,19 @@ Arrow C Data and Arrow C Stream interfaces.
"""
from nanoarrow._lib import c_version
-from nanoarrow.c_lib import (
- c_schema,
- c_array,
+from nanoarrow.c_array import (
c_array_from_buffers,
- c_array_stream,
- c_schema_view,
+ c_array,
c_array_view,
- c_buffer,
- allocate_c_schema,
allocate_c_array,
- allocate_c_array_stream,
)
+from nanoarrow.c_array_stream import c_array_stream, allocate_c_array_stream
+from nanoarrow.c_schema import (
+ c_schema,
+ c_schema_view,
+ allocate_c_schema,
+)
+from nanoarrow.c_buffer import c_buffer
from nanoarrow.schema import (
Schema,
Type,
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 752a009c..99127087 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -858,22 +858,48 @@ cdef class CSchemaView:
def storage_type_id(self):
return self._schema_view.storage_type
+ @property
+ def buffer_format(self):
+ """The Python struct format representing an element of this type
+ or None if there is no Python format string that can represent this
+ type.
+ """
+ if self.extension_name or self._schema_view.type !=
self._schema_view.storage_type:
+ return None
+
+ cdef char out[128]
+ cdef int element_size_bits = 0
+ if self._schema_view.type == NANOARROW_TYPE_FIXED_SIZE_BINARY:
+ element_size_bits = self._schema_view.fixed_size * 8
+
+ try:
+ c_format_from_arrow_type(self._schema_view.type,
element_size_bits, sizeof(out), out)
+ return out.decode()
+ except ValueError:
+ return None
+
@property
def type(self):
cdef const char* type_str = ArrowTypeString(self._schema_view.type)
if type_str != NULL:
- return type_str.decode('UTF-8')
+ return type_str.decode()
+ else:
+ raise ValueError("ArrowTypeString() returned NULL")
@property
def storage_type(self):
cdef const char* type_str =
ArrowTypeString(self._schema_view.storage_type)
if type_str != NULL:
- return type_str.decode('UTF-8')
+ return type_str.decode()
+ else:
+ raise ValueError("ArrowTypeString() returned NULL")
@property
def dictionary_ordered(self):
if self._schema_view.type == NANOARROW_TYPE_DICTIONARY:
return self._dictionary_ordered != 0
+ else:
+ return None
@property
def nullable(self):
@@ -883,47 +909,65 @@ cdef class CSchemaView:
def map_keys_sorted(self):
if self._schema_view.type == NANOARROW_TYPE_MAP:
return self._map_keys_sorted != 0
+ else:
+ return None
@property
def fixed_size(self):
if self._schema_view.type in CSchemaView._fixed_size_types:
return self._schema_view.fixed_size
+ else:
+ return None
@property
def decimal_bitwidth(self):
if self._schema_view.type in CSchemaView._decimal_types:
return self._schema_view.decimal_bitwidth
+ else:
+ return None
@property
def decimal_precision(self):
if self._schema_view.type in CSchemaView._decimal_types:
return self._schema_view.decimal_precision
+ else:
+ return None
@property
def decimal_scale(self):
if self._schema_view.type in CSchemaView._decimal_types:
return self._schema_view.decimal_scale
+ else:
+ return None
@property
def time_unit_id(self):
if self._schema_view.type in CSchemaView._time_unit_types:
return self._schema_view.time_unit
+ else:
+ return None
@property
def time_unit(self):
if self._schema_view.type in CSchemaView._time_unit_types:
- return
ArrowTimeUnitString(self._schema_view.time_unit).decode('UTF-8')
+ return ArrowTimeUnitString(self._schema_view.time_unit).decode()
+ else:
+ return None
@property
def timezone(self):
if self._schema_view.type == NANOARROW_TYPE_TIMESTAMP:
- return self._schema_view.timezone.decode('UTF_8')
+ return self._schema_view.timezone.decode()
+ else:
+ return None
@property
def union_type_ids(self):
if self._schema_view.type in CSchemaView._union_types:
- type_ids_str =
self._schema_view.union_type_ids.decode('UTF-8').split(',')
+ type_ids_str = self._schema_view.union_type_ids.decode().split(',')
return (int(type_id) for type_id in type_ids_str)
+ else:
+ return None
@property
def extension_name(self):
@@ -932,7 +976,9 @@ cdef class CSchemaView:
self._schema_view.extension_name.data,
self._schema_view.extension_name.size_bytes
)
- return name_bytes.decode('UTF-8')
+ return name_bytes.decode()
+ else:
+ return None
@property
def extension_metadata(self):
@@ -941,7 +987,8 @@ cdef class CSchemaView:
self._schema_view.extension_metadata.data,
self._schema_view.extension_metadata.size_bytes
)
-
+ else:
+ return None
def __repr__(self):
return _repr_utils.schema_view_repr(self)
@@ -2275,6 +2322,12 @@ cdef class CArrayBuilder:
def allocate():
return CArrayBuilder(CArray.allocate(CSchema.allocate()))
+ def is_empty(self):
+ if self._ptr.release == NULL:
+ raise RuntimeError("CArrayBuilder is not initialized")
+
+ return self._ptr.length == 0
+
def init_from_type(self, int type_id):
if self._ptr.release != NULL:
raise RuntimeError("CArrayBuilder is already initialized")
diff --git a/python/src/nanoarrow/array.py b/python/src/nanoarrow/array.py
index 388fc656..f6aa4510 100644
--- a/python/src/nanoarrow/array.py
+++ b/python/src/nanoarrow/array.py
@@ -26,7 +26,8 @@ from nanoarrow._lib import (
CMaterializedArrayStream,
Device,
)
-from nanoarrow.c_lib import c_array, c_array_stream, c_array_view
+from nanoarrow.c_array import c_array, c_array_view
+from nanoarrow.c_array_stream import c_array_stream
from nanoarrow.iterator import iter_array_views, iter_py, iter_tuples
from nanoarrow.schema import Schema
diff --git a/python/src/nanoarrow/array_stream.py b/python/src/nanoarrow/array_stream.py
index 1f03dcc2..6fa1e0f2 100644
--- a/python/src/nanoarrow/array_stream.py
+++ b/python/src/nanoarrow/array_stream.py
@@ -21,7 +21,7 @@ from typing import Iterable, Tuple
from nanoarrow._lib import CMaterializedArrayStream
from nanoarrow._repr_utils import make_class_label
from nanoarrow.array import Array
-from nanoarrow.c_lib import c_array_stream
+from nanoarrow.c_array_stream import c_array_stream
from nanoarrow.iterator import iter_py, iter_tuples
from nanoarrow.schema import Schema
diff --git a/python/src/nanoarrow/c_array.py b/python/src/nanoarrow/c_array.py
new file mode 100644
index 00000000..10849d53
--- /dev/null
+++ b/python/src/nanoarrow/c_array.py
@@ -0,0 +1,579 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Any, Iterable, Literal, Tuple
+
+from nanoarrow._lib import (
+ CArray,
+ CArrayBuilder,
+ CArrayView,
+ CArrowType,
+ CBuffer,
+ CBufferBuilder,
+ CSchema,
+ CSchemaBuilder,
+ NoneAwareWrapperIterator,
+ _obj_is_buffer,
+ _obj_is_capsule,
+)
+from nanoarrow.c_buffer import c_buffer
+from nanoarrow.c_schema import c_schema, c_schema_view
+
+
+def c_array(obj, schema=None) -> CArray:
+ """ArrowArray wrapper
+
+ This class provides a user-facing interface to access the fields of an
ArrowArray
+ as defined in the Arrow C Data interface, holding an optional reference to
a
+ :class:`CSchema` that can be used to safely deserialize the content.
+
+ These objects are created using :func:`c_array`, which accepts any
array-like
+ object according to the Arrow PyCapsule interface, Python buffer protocol,
+ or iterable of Python objects.
+
+ This Python wrapper allows access to array fields but does not
automatically
+ deserialize their content: use :func:`c_array_view` to validate and
deserialize
+ the content into a more easily inspectable object.
+
+ Note that the :class:`CArray` objects returned by ``.child()`` hold strong
+ references to the original ``ArrowArray`` to avoid copies while inspecting
an
+ imported structure.
+
+ Parameters
+ ----------
+ obj : array-like
+ An object supporting the Arrow PyCapsule interface, the Python buffer
+ protocol, or an iterable of Python objects.
+ schema : schema-like or None
+ A schema-like object as sanitized by :func:`c_schema` or None. This
value
+ will be used to request a data type from ``obj``; however, the
conversion
+ is best-effort (i.e., the data type of the returned ``CArray`` may be
+ different than ``schema``).
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> # Create from iterable
+ >>> array = na.c_array([1, 2, 3], na.int32())
+ >>> # Create from Python buffer (e.g., numpy array)
+ >>> import numpy as np
+ >>> array = na.c_array(np.array([1, 2, 3]))
+ >>> # Create from Arrow PyCapsule (e.g., pyarrow array)
+ >>> import pyarrow as pa
+ >>> array = na.c_array(pa.array([1, 2, 3]))
+ >>> # Access array fields
+ >>> array.length
+ 3
+ >>> array.null_count
+ 0
+ """
+
+ if schema is not None:
+ schema = c_schema(schema)
+
+ if isinstance(obj, CArray) and schema is None:
+ return obj
+
+ # Try Arrow PyCapsule protocol
+ if hasattr(obj, "__arrow_c_array__"):
+ schema_capsule = None if schema is None else
schema.__arrow_c_schema__()
+ return CArray._import_from_c_capsule(
+ *obj.__arrow_c_array__(requested_schema=schema_capsule)
+ )
+
+ # Try import of bare capsule
+ if _obj_is_capsule(obj, "arrow_array"):
+ if schema is None:
+ schema_capsule = CSchema.allocate()._capsule
+ else:
+ schema_capsule = schema.__arrow_c_schema__()
+
+ return CArray._import_from_c_capsule(schema_capsule, obj)
+
+ # Try _export_to_c for Array/RecordBatch objects if pyarrow < 14.0
+ if _obj_is_pyarrow_array(obj):
+ out = CArray.allocate(CSchema.allocate())
+ obj._export_to_c(out._addr(), out.schema._addr())
+ return out
+
+ # Use the ArrayBuilder classes to handle various strategies for other
+ # types of objects (e.g., iterable, pybuffer, empty).
+ try:
+ builder_cls = _resolve_builder(obj)
+ except Exception as e:
+ raise TypeError(
+ f"Can't convert object of type {type(obj).__name__} "
+ f"to nanoarrow.c_array: \n {e}"
+ ) from e
+
+ try:
+ if schema is None:
+ obj, schema = builder_cls.infer_schema(obj)
+
+ builder = builder_cls(schema)
+ return builder.build_c_array(obj)
+ except Exception as e:
+ raise ValueError(
+ f"An error occurred whilst converting {type(obj).__name__} "
+ f"to nanoarrow.c_array: \n {e}"
+ ) from e
+
+
+def _resolve_builder(obj):
+ if _obj_is_empty(obj):
+ return EmptyArrayBuilder
+
+ if _obj_is_buffer(obj):
+ return ArrayFromPyBufferBuilder
+
+ if _obj_is_iterable(obj):
+ return ArrayFromIterableBuilder
+
+ raise TypeError(
+ f"Can't resolve ArrayBuilder for object of type {type(obj).__name__}"
+ )
+
+
+def allocate_c_array(schema=None) -> CArray:
+ """Allocate an uninitialized ArrowArray
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> schema = na.allocate_c_schema()
+ >>> pa.int32()._export_to_c(schema._addr())
+ """
+ if schema is not None:
+ schema = c_schema(schema)
+
+ return CArray.allocate(CSchema.allocate() if schema is None else schema)
+
+
+def c_array_view(obj, schema=None) -> CArrayView:
+ """ArrowArrayView wrapper
+
+ The ``ArrowArrayView`` is a nanoarrow C library structure that provides
+ structured access to buffers addresses, buffer sizes, and buffer
+ data types. The buffer data is usually propagated from an ArrowArray
+ but can also be propagated from other types of objects (e.g., serialized
+ IPC). The offset and length of this view are independent of its parent
+ (i.e., this object can also represent a slice of its parent).
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import numpy as np
+ >>> import nanoarrow as na
+ >>> array = na.c_array(pa.array(["one", "two", "three", None]))
+ >>> array_view = na.c_array_view(array)
+ >>> np.array(array_view.buffer(1))
+ array([ 0, 3, 6, 11, 11], dtype=int32)
+ >>> np.array(array_view.buffer(2))
+ array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
+ dtype='|S1')
+ """
+
+ if isinstance(obj, CArrayView) and schema is None:
+ return obj
+
+ return c_array(obj, schema).view()
+
+
+def c_array_from_buffers(
+ schema,
+ length: int,
+ buffers: Iterable[Any],
+ null_count: int = -1,
+ offset: int = 0,
+ children: Iterable[Any] = (),
+ validation_level: Literal[None, "full", "default", "minimal", "none"] =
None,
+ move: bool = False,
+) -> CArray:
+ """Create an ArrowArray wrapper from components
+
+ Given a schema, build an ArrowArray buffer-wise. This allows almost any
array
+ to be assembled; however, requires some knowledge of the Arrow Columnar
+ specification. This function will do its best to validate the sizes and
+ content of buffers according to ``validation_level``; however, not all
+ types of arrays can currently be validated when constructed in this way.
+
+ Parameters
+ ----------
+ schema : schema-like
+ The data type of the desired array as sanitized by :func:`c_schema`.
+ length : int
+ The length of the output array.
+ buffers : Iterable of buffer-like or None
+ An iterable of buffers as sanitized by :func:`c_buffer`. Any object
+ supporting the Python Buffer protocol is accepted. Buffer data types
+ are not checked. A buffer value of ``None`` will skip setting a buffer
+ (i.e., that buffer will be of length zero and its pointer will
+ be ``NULL``).
+ null_count : int, optional
+ The number of null values, if known in advance. If -1 (the default),
+ the null count will be calculated based on the validity bitmap. If
+ the validity bitmap was set to ``None``, the calculated null count
+ will be zero.
+ offset : int, optional
+ The logical offset from the start of the array.
+ children : Iterable of array-like
+ An iterable of arrays used to set child fields of the array. Can contain
+ any object accepted by :func:`c_array`. Must contain the exact number of
+ required children as specified by ``schema``.
+ validation_level: None or str, optional
+ One of "none" (no check), "minimal" (check buffer sizes that do not
require
+ dereferencing buffer content), "default" (check all buffer sizes), or
"full"
+ (check all buffer sizes and all buffer content). The default, ``None``,
+ will validate at the "default" level where possible.
+ move : bool, optional
+ Use ``True`` to move ownership of any input buffers or children to the
+ output array.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> c_array = na.c_array_from_buffers(na.uint8(), 5, [None, b"12345"])
+ >>> na.c_array_view(c_array)
+ <nanoarrow.c_lib.CArrayView>
+ - storage_type: 'uint8'
+ - length: 5
+ - offset: 0
+ - null_count: 0
+ - buffers[2]:
+ - validity <bool[0 b] >
+ - data <uint8[5 b] 49 50 51 52 53>
+ - dictionary: NULL
+ - children[0]:
+ """
+ schema = c_schema(schema)
+ builder = CArrayBuilder.allocate()
+
+ # Ensures that the output array->n_buffers is set and that the correct
number
+ # of children have been initialized.
+ builder.init_from_schema(schema)
+
+ # Set buffers, optionally moving ownership of the buffers as well (i.e.,
+ # the objects in the input buffers would be replaced with an empty
ArrowBuffer)
+ for i, buffer in enumerate(buffers):
+ if buffer is None:
+ continue
+
+ # If we're setting a CBuffer from something else, we can avoid an extra
+ # level of Python wrapping by using move=True
+ move = move or not isinstance(buffer, CBuffer)
+ builder.set_buffer(i, c_buffer(buffer), move=move)
+
+ # Set children, optionally moving ownership of the children as well (i.e.,
+ # the objects in the input children would be marked released).
+ n_children = 0
+ for child_src in children:
+ # If we're setting a CArray from something else, we can avoid an extra
+ # level of Python wrapping by using move=True
+ move = move or not isinstance(child_src, CArray)
+ builder.set_child(n_children, c_array(child_src), move=move)
+ n_children += 1
+
+ if n_children != schema.n_children:
+ raise ValueError(f"Expected {schema.n_children} children but got
{n_children}")
+
+ # Set array fields
+ builder.set_length(length)
+ builder.set_offset(offset)
+ builder.set_null_count(null_count)
+
+ # Calculates the null count if -1 (and if applicable)
+ builder.resolve_null_count()
+
+ # Validate + finish
+ return builder.finish(validation_level=validation_level)
+
+
+class ArrayBuilder:
+ """Internal utility to build CArrays from various types of input
+
+ This class and its subclasses are designed to help separate the code
+ that actually builds a CArray from the code that chooses the strategy
+ used to do the building.
+ """
+
+ @classmethod
+ def infer_schema(cls, obj) -> Tuple[CSchema, Any]:
+ """Infer the Arrow data type from a target object
+
+ Returns the type as a :class:`CSchema` and an object that can be
+ consumed in the same way by append() in the event it had to be
+ modified to infer its type (e.g., for an iterable, it would be
+ necessary to consume the first element from the original iterator).
+ """
+ raise NotImplementedError()
+
+ def __init__(self, schema):
+ self._schema = c_schema(schema)
+ self._schema_view = c_schema_view(self._schema)
+ self._c_builder = CArrayBuilder.allocate()
+ self._c_builder.init_from_schema(self._schema)
+
+ def build_c_array(self, obj):
+ self.start_building()
+ self.append(obj)
+ return self.finish_building()
+
+ def start_building(self) -> None:
+ pass
+
+ def append(self, obj: Any) -> None:
+ raise NotImplementedError()
+
+ def finish_building(self) -> CArray:
+ return self._c_builder.finish()
+
+
+class EmptyArrayBuilder(ArrayBuilder):
+ """Build an empty CArray of any type
+
+ This builder accepts any empty input and produces a valid length zero
+ array as output.
+ """
+
+ @classmethod
+ def infer_schema(cls, obj) -> Tuple[Any, CSchema]:
+ return obj, CSchemaBuilder.allocate().set_type(CArrowType.NA)
+
+ def start_building(self) -> None:
+ self._c_builder.start_appending()
+
+ def append(self, obj: Any) -> None:
+ if len(obj) != 0:
+ raise ValueError(
+ f"Can't build empty array from {type(obj).__name__} "
+ f"with length {len(obj)}"
+ )
+
+
+class ArrayFromPyBufferBuilder(ArrayBuilder):
+ """Build a CArray from a Python Buffer
+
+ This builder converts a Python buffer (e.g., numpy array, bytes,
array.array)
+ to a CArray (without copying the contents of the buffer).
+ """
+
+ @classmethod
+ def infer_schema(cls, obj) -> Tuple[CBuffer, CSchema]:
+ if not isinstance(obj, CBuffer):
+ obj = CBuffer.from_pybuffer(obj)
+
+ type_id = obj.data_type_id
+ element_size_bits = obj.element_size_bits
+
+ # Fixed-size binary needs a schema
+ if type_id == CArrowType.BINARY and element_size_bits != 0:
+ schema = (
+ CSchemaBuilder.allocate()
+ .set_type_fixed_size(
+ CArrowType.FIXED_SIZE_BINARY, element_size_bits // 8
+ )
+ .finish()
+ )
+ elif type_id == CArrowType.STRING:
+ schema =
CSchemaBuilder.allocate().set_type(CArrowType.INT8).finish()
+ elif type_id == CArrowType.BINARY:
+ schema =
CSchemaBuilder.allocate().set_type(CArrowType.UINT8).finish()
+ else:
+ schema = CSchemaBuilder.allocate().set_type(type_id).finish()
+
+ return obj, schema
+
+ def __init__(self, schema):
+ super().__init__(schema)
+
+ if self._schema_view.buffer_format is None:
+ raise ValueError(
+ f"Can't build array of type {self._schema_view.type} from
PyBuffer"
+ )
+
+ def append(self, obj: Any) -> None:
+ if not self._c_builder.is_empty():
+ raise ValueError("Can't append to non-empty
ArrayFromPyBufferBuilder")
+
+ if not isinstance(obj, CBuffer):
+ obj = CBuffer.from_pybuffer(obj)
+
+ if (
+ self._schema_view.buffer_format in ("b", "c")
+ and obj.format not in ("b", "c")
+ ) and self._schema_view.buffer_format != obj.format:
+ raise ValueError(
+ f"Expected buffer with format
'{self._schema_view.buffer_format}' "
+ f"but got buffer with format '{obj.format}'"
+ )
+
+ self._c_builder.set_buffer(1, obj)
+ self._c_builder.set_length(len(obj))
+ self._c_builder.set_null_count(0)
+ self._c_builder.set_offset(0)
+
+
+class ArrayFromIterableBuilder(ArrayBuilder):
+ """Build a CArray from an iterable of scalar objects
+
+ This builder converts an iterable to a CArray using some heuristics to pick
+ the fastest available method for converting to a particular type of array.
+ Briefly, the methods are (1) ArrowArrayAppendXXX() functions from the C
+ library (string, binary), (2) array.array() (integer/float except float16),
+ (3) CBufferBuilder.write_elements() (everything else).
+ """
+
+ @classmethod
+ def infer_schema(cls, obj) -> Tuple[CBuffer, CSchema]:
+ raise ValueError("schema is required to build array from iterable")
+
+ def __init__(self, schema):
+ super().__init__(schema)
+
+ # Resolve the method name we are going to use to do the building from
+ # the provided schema.
+ type_id = self._schema_view.type_id
+ if type_id not in _ARRAY_BUILDER_FROM_ITERABLE_METHOD:
+ raise ValueError(
+ f"Can't build array of type {self._schema_view.type} from
iterable"
+ )
+
+ method_name = _ARRAY_BUILDER_FROM_ITERABLE_METHOD[type_id]
+
+ # If there might be nulls, we may need to pick a different strategy
+ if (
+ self._schema_view.nullable
+ and method_name in _ARRAY_BUILDER_FROM_NULLABLE_ITERABLE_METHOD
+ ):
+ method_name =
_ARRAY_BUILDER_FROM_NULLABLE_ITERABLE_METHOD[method_name]
+
+ self._append_impl = getattr(self, method_name)
+
+ def start_building(self) -> None:
+ self._c_builder.start_appending()
+
+ def append(self, obj: Any) -> None:
+ self._append_impl(obj)
+
+ def _append_strings(self, obj: Iterable) -> None:
+ self._c_builder.append_strings(obj)
+
+ def _append_bytes(self, obj: Iterable) -> None:
+ self._c_builder.append_bytes(obj)
+
+ def _build_nullable_array_using_array(self, obj: Iterable) -> None:
+ wrapper = NoneAwareWrapperIterator(
+ obj, self._schema_view.storage_type_id,
self._schema_view.fixed_size
+ )
+ self._append_using_array(wrapper)
+
+ _, null_count, validity = wrapper.finish()
+ if validity is not None:
+ self._c_builder.set_buffer(0, validity, move=True)
+
+ self._c_builder.set_null_count(null_count)
+
+ def _build_nullable_array_using_buffer_builder(self, obj: Iterable) ->
None:
+ wrapper = NoneAwareWrapperIterator(
+ obj, self._schema_view.storage_type_id,
self._schema_view.fixed_size
+ )
+ self._append_using_buffer_builder(wrapper)
+
+ _, null_count, validity = wrapper.finish()
+ if validity is not None:
+ self._c_builder.set_buffer(0, validity, move=True)
+
+ self._c_builder.set_null_count(null_count)
+
+ def _append_using_array(self, obj: Iterable) -> None:
+ from array import array
+
+ py_array = array(self._schema_view.buffer_format, obj)
+ buffer = CBuffer.from_pybuffer(py_array)
+ self._c_builder.set_buffer(1, buffer, move=True)
+ self._c_builder.set_length(len(buffer))
+ self._c_builder.set_null_count(0)
+ self._c_builder.set_offset(0)
+
+ def _append_using_buffer_builder(self, obj: Iterable) -> None:
+ builder = CBufferBuilder()
+ builder.set_data_type(self._schema_view.type_id)
+
+ n_values = builder.write_elements(obj)
+
+ buffer = builder.finish()
+ self._c_builder.set_buffer(1, buffer, move=True)
+ self._c_builder.set_length(n_values)
+ self._c_builder.set_null_count(0)
+ self._c_builder.set_offset(0)
+
+
+_ARRAY_BUILDER_FROM_ITERABLE_METHOD = {
+ CArrowType.BOOL: "_append_using_buffer_builder",
+ CArrowType.HALF_FLOAT: "_append_using_buffer_builder",
+ CArrowType.INTERVAL_MONTH_DAY_NANO: "_append_using_buffer_builder",
+ CArrowType.INTERVAL_DAY_TIME: "_append_using_buffer_builder",
+ CArrowType.INTERVAL_MONTHS: "_append_using_buffer_builder",
+ CArrowType.BINARY: "_append_bytes",
+ CArrowType.LARGE_BINARY: "_append_bytes",
+ CArrowType.FIXED_SIZE_BINARY: "_append_bytes",
+ CArrowType.STRING: "_append_strings",
+ CArrowType.LARGE_STRING: "_append_strings",
+ CArrowType.INT8: "_append_using_array",
+ CArrowType.UINT8: "_append_using_array",
+ CArrowType.INT16: "_append_using_array",
+ CArrowType.UINT16: "_append_using_array",
+ CArrowType.INT32: "_append_using_array",
+ CArrowType.UINT32: "_append_using_array",
+ CArrowType.INT64: "_append_using_array",
+ CArrowType.UINT64: "_append_using_array",
+ CArrowType.FLOAT: "_append_using_array",
+ CArrowType.DOUBLE: "_append_using_array",
+}
+
+_ARRAY_BUILDER_FROM_NULLABLE_ITERABLE_METHOD = {
+ "_append_using_array": "_build_nullable_array_using_array",
+ "_append_using_buffer_builder":
"_build_nullable_array_using_buffer_builder",
+}
+
+
+def _obj_is_iterable(obj):
+ return hasattr(obj, "__iter__")
+
+
+def _obj_is_empty(obj):
+ return hasattr(obj, "__len__") and len(obj) == 0
+
+
+# This is a heuristic for detecting a pyarrow.Array or pyarrow.RecordBatch
+# for pyarrow < 14.0.0, after which the __arrow_c_array__ protocol
+# is sufficient to detect such an array. This check can't use isinstance()
+# to avoid importing pyarrow unnecessarily.
+def _obj_is_pyarrow_array(obj):
+ obj_type = type(obj)
+ if not obj_type.__module__.startswith("pyarrow"):
+ return False
+
+ if not obj_type.__name__.endswith("Array") and obj_type.__name__ !=
"RecordBatch":
+ return False
+
+ return hasattr(obj, "_export_to_c")
diff --git a/python/src/nanoarrow/c_array_stream.py b/python/src/nanoarrow/c_array_stream.py
new file mode 100644
index 00000000..b39f96c7
--- /dev/null
+++ b/python/src/nanoarrow/c_array_stream.py
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from nanoarrow._lib import CArrayStream, _obj_is_capsule
+from nanoarrow.c_array import c_array
+from nanoarrow.c_schema import c_schema
+
+
+def c_array_stream(obj=None, schema=None) -> CArrayStream:
+ """ArrowArrayStream wrapper
+
+ This class provides a user-facing interface to access the fields of
+ an ArrowArrayStream as defined in the Arrow C Stream interface.
+ These objects are usually created using `nanoarrow.c_array_stream()`.
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> pa_column = pa.array([1, 2, 3], pa.int32())
+ >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
+ >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema,
[pa_batch])
+ >>> array_stream = na.c_array_stream(pa_reader)
+ >>> array_stream.get_schema()
+ <nanoarrow.c_lib.CSchema struct>
+ - format: '+s'
+ - name: ''
+ - flags: 0
+ - metadata: NULL
+ - dictionary: NULL
+ - children[1]:
+ 'col1': <nanoarrow.c_lib.CSchema int32>
+ - format: 'i'
+ - name: 'col1'
+ - flags: 2
+ - metadata: NULL
+ - dictionary: NULL
+ - children[0]:
+ >>> array_stream.get_next().length
+ 3
+ >>> array_stream.get_next() is None
+ Traceback (most recent call last):
+ ...
+ StopIteration
+ """
+
+ if schema is not None:
+ schema = c_schema(schema)
+
+ if isinstance(obj, CArrayStream) and schema is None:
+ return obj
+
+ # Try capsule protocol
+ if hasattr(obj, "__arrow_c_stream__"):
+ schema_capsule = None if schema is None else
schema.__arrow_c_schema__()
+ return CArrayStream._import_from_c_capsule(
+ obj.__arrow_c_stream__(requested_schema=schema_capsule)
+ )
+
+ # Try import of bare capsule
+ if _obj_is_capsule(obj, "arrow_array_stream"):
+ if schema is not None:
+ raise TypeError(
+ "Can't import c_array_stream from capsule with requested
schema"
+ )
+ return CArrayStream._import_from_c_capsule(obj)
+
+ # Try _export_to_c for RecordBatchReader objects if pyarrow < 14.0
+ if _obj_is_pyarrow_record_batch_reader(obj):
+ out = CArrayStream.allocate()
+ obj._export_to_c(out._addr())
+ return out
+
+ try:
+ array = c_array(obj, schema=schema)
+ return CArrayStream.from_array_list([array], array.schema,
validate=False)
+ except Exception as e:
+ raise TypeError(
+ f"An error occurred whilst converting {type(obj).__name__} "
+ f"to nanoarrow.c_array_stream or nanoarrow.c_array: \n {e}"
+ ) from e
+
+
+def allocate_c_array_stream() -> CArrayStream:
+ """Allocate an uninitialized ArrowArrayStream wrapper
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> pa_column = pa.array([1, 2, 3], pa.int32())
+ >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
+ >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema,
[pa_batch])
+ >>> array_stream = na.allocate_c_array_stream()
+ >>> pa_reader._export_to_c(array_stream._addr())
+ """
+ return CArrayStream.allocate()
+
+
+def _obj_is_pyarrow_record_batch_reader(obj):
+ obj_type = type(obj)
+ if not obj_type.__module__.startswith("pyarrow"):
+ return False
+
+ if not obj_type.__name__.endswith("RecordBatchReader"):
+ return False
+
+ return hasattr(obj, "_export_to_c")
diff --git a/python/src/nanoarrow/c_buffer.py b/python/src/nanoarrow/c_buffer.py
new file mode 100644
index 00000000..814a5a76
--- /dev/null
+++ b/python/src/nanoarrow/c_buffer.py
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from nanoarrow._lib import CArrowType, CBuffer, CBufferBuilder, _obj_is_buffer
+from nanoarrow.c_schema import c_schema_view
+
+
+def c_buffer(obj, schema=None) -> CBuffer:
+ """Owning, read-only ArrowBuffer wrapper
+
+ If obj implements the Python buffer protocol, ``c_buffer()`` wraps
+ obj in nanoarrow's owning buffer structure, the ArrowBuffer,
+ such that it can be used to construct arrays. The ownership of the
+ underlying buffer is handled by the Python buffer protocol
+ (i.e., ``PyObject_GetBuffer()`` and ``PyBuffer_Release()``).
+
+ If obj is iterable, a buffer will be allocated and populated with
+ the contents of obj according to ``schema``. The
+ ``schema`` parameter is required to create a buffer from
+ a Python iterable. The ``struct`` module is currently used to encode
+ values from obj into binary form.
+
+ Unlike with :func:`c_array`, ``schema`` is explicitly
+ honoured (or an error will be raised).
+
+ Parameters
+ ----------
+
+ obj : buffer-like or iterable
+ A Python object that supports the Python buffer protocol. This includes
+ bytes, memoryview, bytearray, built-in types as well as numpy arrays.
+ schema : schema-like, optional
+ The data type of the desired buffer as sanitized by
+ :func:`c_schema`. Only values that make sense as buffer types are
+ allowed (e.g., integer types, floating-point types, interval types,
+ decimal types, binary, string, fixed-size binary).
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.c_buffer(b"1234")
+ nanoarrow.c_lib.CBuffer(uint8[4 b] 49 50 51 52)
+ >>> na.c_buffer([1, 2, 3], na.int32())
+ nanoarrow.c_lib.CBuffer(int32[12 b] 1 2 3)
+ """
+ if isinstance(obj, CBuffer) and schema is None:
+ return obj
+
+ if _obj_is_buffer(obj):
+ if schema is not None:
+ raise NotImplementedError(
+ "c_buffer() with schema for pybuffer is not implemented"
+ )
+ return CBuffer.from_pybuffer(obj)
+
+ if _obj_is_iterable(obj):
+ buffer, _ = _c_buffer_from_iterable(obj, schema)
+ return buffer
+
+ raise TypeError(
+ f"Can't convert object of type {type(obj).__name__} to
nanoarrow.c_buffer"
+ )
+
+
+def _c_buffer_from_iterable(obj, schema=None) -> CBuffer:
+ import array
+
+ # array.typecodes is not available in all PyPy versions.
+ # Rather than guess, just don't use the array constructor if
+ # this attribute is not available.
+ if hasattr(array, "typecodes"):
+ array_typecodes = array.typecodes
+ else:
+ array_typecodes = []
+
+ if schema is None:
+ raise ValueError("CBuffer from iterable requires schema")
+
+ schema_view = c_schema_view(schema)
+ if (
+ schema_view.extension_name is not None
+ or schema_view.storage_type_id != schema_view.type_id
+ ):
+ raise ValueError(
+ f"Can't create buffer from iterable for type {schema_view.type}"
+ )
+
+ builder = CBufferBuilder()
+
+ if schema_view.storage_type_id == CArrowType.FIXED_SIZE_BINARY:
+ builder.set_data_type(CArrowType.BINARY, schema_view.fixed_size * 8)
+ else:
+ builder.set_data_type(schema_view.storage_type_id)
+
+ # If we are using a typecode supported by the array module, it has much
+ # faster implementations of safely building buffers from iterables
+ if (
+ builder.format in array_typecodes
+ and schema_view.storage_type_id != CArrowType.BOOL
+ ):
+ buf = array.array(builder.format, obj)
+ return CBuffer.from_pybuffer(buf), len(buf)
+
+ n_values = builder.write_elements(obj)
+ return builder.finish(), n_values
+
+
+def _obj_is_iterable(obj):
+ return hasattr(obj, "__iter__")
diff --git a/python/src/nanoarrow/c_lib.py b/python/src/nanoarrow/c_lib.py
deleted file mode 100644
index 6c1ff2b8..00000000
--- a/python/src/nanoarrow/c_lib.py
+++ /dev/null
@@ -1,710 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Arrow and nanoarrow C structure wrappers
-
-These classes and their constructors wrap Arrow C Data/Stream interface
structures
-(i.e., ``ArrowArray``, ``ArrowSchema``, and ``ArrowArrayStream``) and the
-nanoarrow C library structures that help deserialize their content (i.e., the
-``ArrowSchemaView`` and ``ArrowArrayView``). These wrappers are currently
implemented
-in Cython and their scope is limited to lifecycle management and member access
as
-Python objects.
-"""
-
-from typing import Any, Iterable, Literal
-
-from nanoarrow._lib import (
- CArray,
- CArrayBuilder,
- CArrayStream,
- CArrayView,
- CArrowType,
- CBuffer,
- CBufferBuilder,
- CSchema,
- CSchemaBuilder,
- CSchemaView,
- NoneAwareWrapperIterator,
- _obj_is_buffer,
- _obj_is_capsule,
-)
-
-
-def c_schema(obj=None) -> CSchema:
- """ArrowSchema wrapper
-
- The ``CSchema`` class provides a Python-friendly interface to access the
fields
- of an ``ArrowSchema`` as defined in the Arrow C Data interface. These
objects
- are created using `nanoarrow.c_schema()`, which accepts any schema or
- data type-like object according to the Arrow PyCapsule interface.
-
- This Python wrapper allows access to schema struct members but does not
- automatically deserialize their content: use :func:`c_schema_view` to
validate
- and deserialize the content into a more easily inspectable object.
-
- Note that the :class:`CSchema` objects returned by ``.child()`` hold strong
- references to the original `ArrowSchema` to avoid copies while inspecting
an
- imported structure.
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> schema = na.c_schema(pa.int32())
- >>> schema.is_valid()
- True
- >>> schema.format
- 'i'
- >>> schema.name
- ''
- """
-
- if isinstance(obj, CSchema):
- return obj
-
- if hasattr(obj, "__arrow_c_schema__"):
- return CSchema._import_from_c_capsule(obj.__arrow_c_schema__())
-
- if _obj_is_capsule(obj, "arrow_schema"):
- return CSchema._import_from_c_capsule(obj)
-
- # for pyarrow < 14.0
- if hasattr(obj, "_export_to_c"):
- out = CSchema.allocate()
- obj._export_to_c(out._addr())
- return out
- else:
- raise TypeError(
- f"Can't convert object of type {type(obj).__name__} to
nanoarrow.c_schema"
- )
-
-
-def c_array(obj, schema=None) -> CArray:
- """ArrowArray wrapper
-
- This class provides a user-facing interface to access the fields of an
ArrowArray
- as defined in the Arrow C Data interface, holding an optional reference to
a
- :class:`CSchema` that can be used to safely deserialize the content.
-
- These objects are created using :func:`c_array`, which accepts any
array-like
- object according to the Arrow PyCapsule interface, Python buffer protocol,
- or iterable of Python objects.
-
- This Python wrapper allows access to array fields but does not
automatically
- deserialize their content: use :func:`c_array_view` to validate and
deserialize
- the content into a more easily inspectable object.
-
- Note that the :class:`CArray` objects returned by ``.child()`` hold strong
- references to the original ``ArrowArray`` to avoid copies while inspecting
an
- imported structure.
-
- Parameters
- ----------
- obj : array-like
- An object supporting the Arrow PyCapsule interface, the Python buffer
- protocol, or an iterable of Python objects.
- schema : schema-like or None
- A schema-like object as sanitized by :func:`c_schema` or None. This
value
- will be used to request a data type from ``obj``; however, the
conversion
- is best-effort (i.e., the data type of the returned ``CArray`` may be
- different than ``schema``).
-
- Examples
- --------
-
- >>> import nanoarrow as na
- >>> # Create from iterable
- >>> array = na.c_array([1, 2, 3], na.int32())
- >>> # Create from Python buffer (e.g., numpy array)
- >>> import numpy as np
- >>> array = na.c_array(np.array([1, 2, 3]))
- >>> # Create from Arrow PyCapsule (e.g., pyarrow array)
- >>> import pyarrow as pa
- >>> array = na.c_array(pa.array([1, 2, 3]))
- >>> # Access array fields
- >>> array.length
- 3
- >>> array.null_count
- 0
- """
-
- if schema is not None:
- schema = c_schema(schema)
-
- if isinstance(obj, CArray) and schema is None:
- return obj
-
- # Try Arrow PyCapsule protocol
- if hasattr(obj, "__arrow_c_array__"):
- schema_capsule = None if schema is None else
schema.__arrow_c_schema__()
- return CArray._import_from_c_capsule(
- *obj.__arrow_c_array__(requested_schema=schema_capsule)
- )
-
- # Try buffer protocol (e.g., numpy arrays or a c_buffer())
- if _obj_is_buffer(obj):
- return _c_array_from_pybuffer(obj)
-
- # Try import of bare capsule
- if _obj_is_capsule(obj, "arrow_array"):
- if schema is None:
- schema_capsule = CSchema.allocate()._capsule
- else:
- schema_capsule = schema.__arrow_c_schema__()
-
- return CArray._import_from_c_capsule(schema_capsule, obj)
-
- # Try _export_to_c for Array/RecordBatch objects if pyarrow < 14.0
- if _obj_is_pyarrow_array(obj):
- out = CArray.allocate(CSchema.allocate())
- obj._export_to_c(out._addr(), out.schema._addr())
- return out
-
- # Try import of iterable
- if _obj_is_iterable(obj):
- return _c_array_from_iterable(obj, schema)
-
- raise TypeError(
- f"Can't convert object of type {type(obj).__name__} to
nanoarrow.c_array"
- )
-
-
-def c_array_from_buffers(
- schema,
- length: int,
- buffers: Iterable[Any],
- null_count: int = -1,
- offset: int = 0,
- children: Iterable[Any] = (),
- validation_level: Literal[None, "full", "default", "minimal", "none"] =
None,
- move: bool = False,
-) -> CArray:
- """Create an ArrowArray wrapper from components
-
- Given a schema, build an ArrowArray buffer-wise. This allows almost any
array
- to be assembled; however, requires some knowledge of the Arrow Columnar
- specification. This function will do its best to validate the sizes and
- content of buffers according to ``validation_level``; however, not all
- types of arrays can currently be validated when constructed in this way.
-
- Parameters
- ----------
- schema : schema-like
- The data type of the desired array as sanitized by :func:`c_schema`.
- length : int
- The length of the output array.
- buffers : Iterable of buffer-like or None
- An iterable of buffers as sanitized by :func:`c_buffer`. Any object
- supporting the Python Buffer protocol is accepted. Buffer data types
- are not checked. A buffer value of ``None`` will skip setting a buffer
- (i.e., that buffer will be of length zero and its pointer will
- be ``NULL``).
- null_count : int, optional
- The number of null values, if known in advance. If -1 (the default),
- the null count will be calculated based on the validity bitmap. If
- the validity bitmap was set to ``None``, the calculated null count
- will be zero.
- offset : int, optional
- The logical offset from the start of the array.
- children : Iterable of array-like
- An iterable of arrays used to set child fields of the array. Can
contain
- any object accepted by :func:`c_array`. Must contain the exact number
of
- required children as specifed by ``schema``.
- validation_level: None or str, optional
- One of "none" (no check), "minimal" (check buffer sizes that do not
require
- dereferencing buffer content), "default" (check all buffer sizes), or
"full"
- (check all buffer sizes and all buffer content). The default, ``None``,
- will validate at the "default" level where possible.
- move : bool, optional
- Use ``True`` to move ownership of any input buffers or children to the
- output array.
-
- Examples
- --------
-
- >>> import nanoarrow as na
- >>> c_array = na.c_array_from_buffers(na.uint8(), 5, [None, b"12345"])
- >>> na.c_array_view(c_array)
- <nanoarrow.c_lib.CArrayView>
- - storage_type: 'uint8'
- - length: 5
- - offset: 0
- - null_count: 0
- - buffers[2]:
- - validity <bool[0 b] >
- - data <uint8[5 b] 49 50 51 52 53>
- - dictionary: NULL
- - children[0]:
- """
- schema = c_schema(schema)
- builder = CArrayBuilder.allocate()
-
- # Ensures that the output array->n_buffers is set and that the correct
number
- # of children have been initialized.
- builder.init_from_schema(schema)
-
- # Set buffers, optionally moving ownership of the buffers as well (i.e.,
- # the objects in the input buffers would be replaced with an empty
ArrowBuffer)
- for i, buffer in enumerate(buffers):
- if buffer is None:
- continue
-
- # If we're setting a CBuffer from something else, we can avoid an extra
- # level of Python wrapping by using move=True
- move = move or not isinstance(buffer, CBuffer)
- builder.set_buffer(i, c_buffer(buffer), move=move)
-
- # Set children, optionally moving ownership of the children as well (i.e.,
- # the objects in the input children would be marked released).
- n_children = 0
- for child_src in children:
- # If we're setting a CArray from something else, we can avoid an extra
- # level of Python wrapping by using move=True
- move = move or not isinstance(child_src, CArray)
- builder.set_child(n_children, c_array(child_src), move=move)
- n_children += 1
-
- if n_children != schema.n_children:
- raise ValueError(f"Expected {schema.n_children} children but got
{n_children}")
-
- # Set array fields
- builder.set_length(length)
- builder.set_offset(offset)
- builder.set_null_count(null_count)
-
- # Calculates the null count if -1 (and if applicable)
- builder.resolve_null_count()
-
- # Validate + finish
- return builder.finish(validation_level=validation_level)
-
-
-def c_array_stream(obj=None, schema=None) -> CArrayStream:
- """ArrowArrayStream wrapper
-
- This class provides a user-facing interface to access the fields of
- an ArrowArrayStream as defined in the Arrow C Stream interface.
- These objects are usually created using `nanoarrow.c_array_stream()`.
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> pa_column = pa.array([1, 2, 3], pa.int32())
- >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
- >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema,
[pa_batch])
- >>> array_stream = na.c_array_stream(pa_reader)
- >>> array_stream.get_schema()
- <nanoarrow.c_lib.CSchema struct>
- - format: '+s'
- - name: ''
- - flags: 0
- - metadata: NULL
- - dictionary: NULL
- - children[1]:
- 'col1': <nanoarrow.c_lib.CSchema int32>
- - format: 'i'
- - name: 'col1'
- - flags: 2
- - metadata: NULL
- - dictionary: NULL
- - children[0]:
- >>> array_stream.get_next().length
- 3
- >>> array_stream.get_next() is None
- Traceback (most recent call last):
- ...
- StopIteration
- """
-
- if schema is not None:
- schema = c_schema(schema)
-
- if isinstance(obj, CArrayStream) and schema is None:
- return obj
-
- # Try capsule protocol
- if hasattr(obj, "__arrow_c_stream__"):
- schema_capsule = None if schema is None else
schema.__arrow_c_schema__()
- return CArrayStream._import_from_c_capsule(
- obj.__arrow_c_stream__(requested_schema=schema_capsule)
- )
-
- # Try import of bare capsule
- if _obj_is_capsule(obj, "arrow_array_stream"):
- if schema is not None:
- raise TypeError(
- "Can't import c_array_stream from capsule with requested
schema"
- )
- return CArrayStream._import_from_c_capsule(obj)
-
- # Try _export_to_c for RecordBatchReader objects if pyarrow < 14.0
- if _obj_is_pyarrow_record_batch_reader(obj):
- out = CArrayStream.allocate()
- obj._export_to_c(out._addr())
- return out
-
- try:
- array = c_array(obj, schema=schema)
- return CArrayStream.from_array_list([array], array.schema,
validate=False)
- except Exception as e:
- raise TypeError(
- f"An error occurred whilst converting {type(obj).__name__} "
- f"to nanoarrow.c_array_stream or nanoarrow.c_array: \n {e}"
- ) from e
-
-
-def c_schema_view(obj) -> CSchemaView:
- """ArrowSchemaView wrapper
-
- The ``ArrowSchemaView`` is a nanoarrow C library structure that facilitates
- access to the deserialized content of an ``ArrowSchema`` (e.g., parameter
values for
- parameterized types). This wrapper extends that facility to Python.
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> schema = na.c_schema(pa.decimal128(10, 3))
- >>> schema_view = na.c_schema_view(schema)
- >>> schema_view.type
- 'decimal128'
- >>> schema_view.decimal_bitwidth
- 128
- >>> schema_view.decimal_precision
- 10
- >>> schema_view.decimal_scale
- 3
- """
-
- if isinstance(obj, CSchemaView):
- return obj
-
- return CSchemaView(c_schema(obj))
-
-
-def c_array_view(obj, schema=None) -> CArrayView:
- """ArrowArrayView wrapper
-
- The ``ArrowArrayView`` is a nanoarrow C library structure that provides
- structured access to buffers addresses, buffer sizes, and buffer
- data types. The buffer data is usually propagated from an ArrowArray
- but can also be propagated from other types of objects (e.g., serialized
- IPC). The offset and length of this view are independent of its parent
- (i.e., this object can also represent a slice of its parent).
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import numpy as np
- >>> import nanoarrow as na
- >>> array = na.c_array(pa.array(["one", "two", "three", None]))
- >>> array_view = na.c_array_view(array)
- >>> np.array(array_view.buffer(1))
- array([ 0, 3, 6, 11, 11], dtype=int32)
- >>> np.array(array_view.buffer(2))
- array([b'o', b'n', b'e', b't', b'w', b'o', b't', b'h', b'r', b'e', b'e'],
- dtype='|S1')
- """
-
- if isinstance(obj, CArrayView) and schema is None:
- return obj
-
- return c_array(obj, schema).view()
-
-
-def c_buffer(obj, schema=None) -> CBuffer:
- """Owning, read-only ArrowBuffer wrapper
-
- If obj implements the Python buffer protocol, ``c_buffer()`` wraps
- obj in nanoarrow's owning buffer structure, the ArrowBuffer,
- such that it can be used to construct arrays. The ownership of the
- underlying buffer is handled by the Python buffer protocol
- (i.e., ``PyObject_GetBuffer()`` and ``PyBuffer_Release()``).
-
- If obj is iterable, a buffer will be allocated and populated with
- the contents of obj according to ``schema``. The
- ``schema`` parameter is required to create a buffer from
- a Python iterable. The ``struct`` module is currently used to encode
- values from obj into binary form.
-
- Unlike with :func:`c_array`, ``schema`` is explicitly
- honoured (or an error will be raised).
-
- Parameters
- ----------
-
- obj : buffer-like or iterable
- A Python object that supports the Python buffer protocol. This includes
- bytes, memoryview, bytearray, bulit-in types as well as numpy arrays.
- schema : schema-like, optional
- The data type of the desired buffer as sanitized by
- :func:`c_schema`. Only values that make sense as buffer types are
- allowed (e.g., integer types, floating-point types, interval types,
- decimal types, binary, string, fixed-size binary).
-
- Examples
- --------
-
- >>> import nanoarrow as na
- >>> na.c_buffer(b"1234")
- nanoarrow.c_lib.CBuffer(uint8[4 b] 49 50 51 52)
- >>> na.c_buffer([1, 2, 3], na.int32())
- nanoarrow.c_lib.CBuffer(int32[12 b] 1 2 3)
- """
- if isinstance(obj, CBuffer) and schema is None:
- return obj
-
- if _obj_is_buffer(obj):
- if schema is not None:
- raise NotImplementedError(
- "c_buffer() with schema for pybuffer is not implemented"
- )
- return CBuffer.from_pybuffer(obj)
-
- if _obj_is_iterable(obj):
- buffer, _ = _c_buffer_from_iterable(obj, schema)
- return buffer
-
- raise TypeError(
- f"Can't convert object of type {type(obj).__name__} to
nanoarrow.c_buffer"
- )
-
-
-def allocate_c_schema() -> CSchema:
- """Allocate an uninitialized ArrowSchema wrapper
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> schema = na.allocate_c_schema()
- >>> pa.int32()._export_to_c(schema._addr())
- """
- return CSchema.allocate()
-
-
-def allocate_c_array(schema=None) -> CArray:
- """Allocate an uninitialized ArrowArray
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> schema = na.allocate_c_schema()
- >>> pa.int32()._export_to_c(schema._addr())
- """
- if schema is not None:
- schema = c_schema(schema)
-
- return CArray.allocate(CSchema.allocate() if schema is None else schema)
-
-
-def allocate_c_array_stream() -> CArrayStream:
- """Allocate an uninitialized ArrowArrayStream wrapper
-
- Examples
- --------
-
- >>> import pyarrow as pa
- >>> import nanoarrow as na
- >>> pa_column = pa.array([1, 2, 3], pa.int32())
- >>> pa_batch = pa.record_batch([pa_column], names=["col1"])
- >>> pa_reader = pa.RecordBatchReader.from_batches(pa_batch.schema,
[pa_batch])
- >>> array_stream = na.allocate_c_array_stream()
- >>> pa_reader._export_to_c(array_stream._addr())
- """
- return CArrayStream.allocate()
-
-
-# This is a heuristic for detecting a pyarrow.Array or pyarrow.RecordBatch
-# for pyarrow < 14.0.0, after which the the __arrow_c_array__ protocol
-# is sufficient to detect such an array. This check can't use isinstance()
-# to avoid importing pyarrow unnecessarily.
-def _obj_is_pyarrow_array(obj):
- obj_type = type(obj)
- if not obj_type.__module__.startswith("pyarrow"):
- return False
-
- if not obj_type.__name__.endswith("Array") and obj_type.__name__ !=
"RecordBatch":
- return False
-
- return hasattr(obj, "_export_to_c")
-
-
-def _obj_is_pyarrow_record_batch_reader(obj):
- obj_type = type(obj)
- if not obj_type.__module__.startswith("pyarrow"):
- return False
-
- if not obj_type.__name__.endswith("RecordBatchReader"):
- return False
-
- return hasattr(obj, "_export_to_c")
-
-
-def _obj_is_iterable(obj):
- return hasattr(obj, "__iter__")
-
-
-# Invokes the buffer protocol on obj
-def _c_array_from_pybuffer(obj) -> CArray:
- buffer = CBuffer.from_pybuffer(obj)
- type_id = buffer.data_type_id
- element_size_bits = buffer.element_size_bits
-
- builder = CArrayBuilder.allocate()
-
- # Fixed-size binary needs a schema
- if type_id == CArrowType.BINARY and element_size_bits != 0:
- c_schema = (
- CSchemaBuilder.allocate()
- .set_type_fixed_size(CArrowType.FIXED_SIZE_BINARY,
element_size_bits // 8)
- .finish()
- )
- builder.init_from_schema(c_schema)
- elif type_id == CArrowType.STRING:
- builder.init_from_type(int(CArrowType.INT8))
- elif type_id == CArrowType.BINARY:
- builder.init_from_type(int(CArrowType.UINT8))
- else:
- builder.init_from_type(int(type_id))
-
- # Set the length
- builder.set_length(len(buffer))
-
- # Move ownership of the ArrowBuffer wrapped by buffer to builder.buffer(1)
- builder.set_buffer(1, buffer)
-
- # No nulls or offset from a PyBuffer
- builder.set_null_count(0)
- builder.set_offset(0)
-
- return builder.finish()
-
-
-def _c_array_from_iterable(obj, schema=None) -> CArray:
- if schema is None:
- raise ValueError("schema is required for CArray import from iterable")
-
- obj_len = -1
- if hasattr(obj, "__len__"):
- obj_len = len(obj)
-
- # We can always create an array from an empty iterable, even for types
- # not supported by _c_buffer_from_iterable()
- if obj_len == 0:
- builder = CArrayBuilder.allocate()
- builder.init_from_schema(schema)
- builder.start_appending()
- return builder.finish()
-
- # We need to know a few things about the data type to choose the
appropriate
- # strategy for building the array.
- schema_view = c_schema_view(schema)
-
- if schema_view.storage_type_id != schema_view.type_id:
- raise ValueError(
- f"Can't create array from iterable for type {schema_view.type}"
- )
-
- # Handle variable-size binary types (string, binary)
- if schema_view.type_id in (CArrowType.STRING, CArrowType.LARGE_STRING):
- builder = CArrayBuilder.allocate()
- builder.init_from_schema(schema)
- builder.start_appending()
- builder.append_strings(obj)
- return builder.finish()
- elif schema_view.type_id in (CArrowType.BINARY, CArrowType.LARGE_BINARY):
- builder = CArrayBuilder.allocate()
- builder.init_from_schema(schema)
- builder.start_appending()
- builder.append_bytes(obj)
- return builder.finish()
-
- # Creating a buffer from an iterable does not handle None values,
- # but we can do so here with the NoneAwareWrapperIterator() wrapper.
- # This approach is quite a bit slower, so only do it for a nullable
- # type.
- if schema_view.nullable:
- obj_wrapper = NoneAwareWrapperIterator(
- obj, schema_view.storage_type_id, schema_view.fixed_size
- )
-
- if obj_len > 0:
- obj_wrapper.reserve(obj_len)
-
- data, _ = _c_buffer_from_iterable(obj_wrapper, schema_view)
- n_values, null_count, validity = obj_wrapper.finish()
- else:
- data, n_values = _c_buffer_from_iterable(obj, schema_view)
- null_count = 0
- validity = None
-
- return c_array_from_buffers(
- schema, n_values, buffers=(validity, data), null_count=null_count,
move=True
- )
-
-
-def _c_buffer_from_iterable(obj, schema=None) -> CBuffer:
- import array
-
- # array.typecodes is not available in all PyPy versions.
- # Rather than guess, just don't use the array constructor if
- # this attribute is not available.
- if hasattr(array, "typecodes"):
- array_typecodes = array.typecodes
- else:
- array_typecodes = []
-
- if schema is None:
- raise ValueError("CBuffer from iterable requires schema")
-
- schema_view = c_schema_view(schema)
- if schema_view.extension_name is not None:
- raise ValueError(
- f"Can't create buffer from extension type
{schema_view.extension_name}"
- )
- elif schema_view.storage_type_id != schema_view.type_id:
- raise ValueError(
- f"Can't create buffer from iterable for type {schema_view.type}"
- )
-
- builder = CBufferBuilder()
-
- if schema_view.storage_type_id == CArrowType.FIXED_SIZE_BINARY:
- builder.set_data_type(CArrowType.BINARY, schema_view.fixed_size * 8)
- else:
- builder.set_data_type(schema_view.storage_type_id)
-
- # If we are using a typecode supported by the array module, it has much
- # faster implementations of safely building buffers from iterables
- if (
- builder.format in array_typecodes
- and schema_view.storage_type_id != CArrowType.BOOL
- ):
- buf = array.array(builder.format, obj)
- return CBuffer.from_pybuffer(buf), len(buf)
-
- n_values = builder.write_elements(obj)
- return builder.finish(), n_values
diff --git a/python/src/nanoarrow/c_schema.py b/python/src/nanoarrow/c_schema.py
new file mode 100644
index 00000000..ea1fcf02
--- /dev/null
+++ b/python/src/nanoarrow/c_schema.py
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Arrow and nanoarrow C structure wrappers
+
+These classes and their constructors wrap Arrow C Data/Stream interface
structures
+(i.e., ``ArrowArray``, ``ArrowSchema``, and ``ArrowArrayStream``) and the
+nanoarrow C library structures that help deserialize their content (i.e., the
+``ArrowSchemaView`` and ``ArrowArrayView``). These wrappers are currently
implemented
+in Cython and their scope is limited to lifecycle management and member access
as
+Python objects.
+"""
+
+
+from nanoarrow._lib import CSchema, CSchemaView, _obj_is_capsule
+
+
+def c_schema(obj=None) -> CSchema:
+ """ArrowSchema wrapper
+
+ The ``CSchema`` class provides a Python-friendly interface to access the
fields
+ of an ``ArrowSchema`` as defined in the Arrow C Data interface. These
objects
+ are created using ``nanoarrow.c_schema()``, which accepts any schema or
+ data type-like object according to the Arrow PyCapsule interface.
+
+ This Python wrapper allows access to schema struct members but does not
+ automatically deserialize their content: use :func:`c_schema_view` to
validate
+ and deserialize the content into a more easily inspectable object.
+
+ Note that the :class:`CSchema` objects returned by ``.child()`` hold strong
+ references to the original ``ArrowSchema`` to avoid copies while inspecting
an
+ imported structure.
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> schema = na.c_schema(pa.int32())
+ >>> schema.is_valid()
+ True
+ >>> schema.format
+ 'i'
+ >>> schema.name
+ ''
+ """
+
+ if isinstance(obj, CSchema):
+ return obj
+
+ if hasattr(obj, "__arrow_c_schema__"):
+ return CSchema._import_from_c_capsule(obj.__arrow_c_schema__())
+
+ if _obj_is_capsule(obj, "arrow_schema"):
+ return CSchema._import_from_c_capsule(obj)
+
+ # for pyarrow < 14.0
+ if hasattr(obj, "_export_to_c"):
+ out = CSchema.allocate()
+ obj._export_to_c(out._addr())
+ return out
+ else:
+ raise TypeError(
+ f"Can't convert object of type {type(obj).__name__} to
nanoarrow.c_schema"
+ )
+
+
+def c_schema_view(obj) -> CSchemaView:
+ """ArrowSchemaView wrapper
+
+ The ``ArrowSchemaView`` is a nanoarrow C library structure that facilitates
+ access to the deserialized content of an ``ArrowSchema`` (e.g., parameter
values for
+ parameterized types). This wrapper extends that facility to Python.
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> schema = na.c_schema(pa.decimal128(10, 3))
+ >>> schema_view = na.c_schema_view(schema)
+ >>> schema_view.type
+ 'decimal128'
+ >>> schema_view.decimal_bitwidth
+ 128
+ >>> schema_view.decimal_precision
+ 10
+ >>> schema_view.decimal_scale
+ 3
+ """
+
+ if isinstance(obj, CSchemaView):
+ return obj
+
+ return CSchemaView(c_schema(obj))
+
+
+def allocate_c_schema() -> CSchema:
+ """Allocate an uninitialized ArrowSchema wrapper
+
+ Examples
+ --------
+
+ >>> import pyarrow as pa
+ >>> import nanoarrow as na
+ >>> schema = na.allocate_c_schema()
+ >>> pa.int32()._export_to_c(schema._addr())
+ """
+ return CSchema.allocate()
diff --git a/python/src/nanoarrow/device.py b/python/src/nanoarrow/device.py
index 7bf0dcea..76f9784f 100644
--- a/python/src/nanoarrow/device.py
+++ b/python/src/nanoarrow/device.py
@@ -16,7 +16,8 @@
# under the License.
from nanoarrow._lib import DEVICE_CPU, CDeviceArray, Device, DeviceType #
noqa: F401
-from nanoarrow.c_lib import c_array, c_schema
+from nanoarrow.c_array import c_array
+from nanoarrow.c_schema import c_schema
def cpu():
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index 6a535094..aef56263 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -21,7 +21,8 @@ from itertools import islice
from typing import Iterable, Tuple
from nanoarrow._lib import CArrayView, CArrowType
-from nanoarrow.c_lib import c_array_stream, c_schema, c_schema_view
+from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.c_schema import c_schema, c_schema_view
def iter_py(obj, schema=None) -> Iterable:
diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py
index e521a342..97bb45b8 100644
--- a/python/src/nanoarrow/schema.py
+++ b/python/src/nanoarrow/schema.py
@@ -27,7 +27,7 @@ from nanoarrow._lib import (
CSchemaView,
SchemaMetadata,
)
-from nanoarrow.c_lib import c_schema
+from nanoarrow.c_schema import c_schema
class Type(enum.Enum):
diff --git a/python/tests/test_array.py b/python/tests/test_array.py
index 02ba53bc..787c9761 100644
--- a/python/tests/test_array.py
+++ b/python/tests/test_array.py
@@ -16,7 +16,7 @@
# under the License.
import pytest
-from nanoarrow.c_lib import CArrayStream
+from nanoarrow.c_array_stream import CArrayStream
import nanoarrow as na
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index 91a7af98..b64370da 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -16,23 +16,11 @@
# under the License.
import pytest
-from nanoarrow._lib import NanoarrowException
-from nanoarrow.c_lib import CArrayBuilder
+from nanoarrow._lib import CArrayBuilder, NanoarrowException
import nanoarrow as na
-def test_c_array_builder_init():
- builder = CArrayBuilder.allocate()
- builder.init_from_type(na.Type.INT32.value)
-
- with pytest.raises(RuntimeError, match="CArrayBuilder is already initialized"):
- builder.init_from_type(na.Type.INT32.value)
-
- with pytest.raises(RuntimeError, match="CArrayBuilder is already initialized"):
- builder.init_from_schema(na.c_schema(na.int32()))
-
-
def test_c_array_from_c_array():
c_array = na.c_array([1, 2, 3], na.int32())
c_array_from_c_array = na.c_array(c_array)
@@ -81,7 +69,7 @@ def test_c_array_from_old_pyarrow():
# Make sure that this heuristic won't result in trying to import
# something else that has an _export_to_c method
- with pytest.raises(TypeError, match="Can't convert object of type DataType"):
+ with pytest.raises(TypeError, match="Can't resolve ArrayBuilder"):
not_array = pa.int32()
assert hasattr(not_array, "_export_to_c")
na.c_array(not_array)
@@ -107,7 +95,8 @@ def test_c_array_from_bare_capsule():
def test_c_array_type_not_supported():
- with pytest.raises(TypeError, match="Can't convert object of type NoneType"):
+ msg = "Can't resolve ArrayBuilder for object of type NoneType"
+ with pytest.raises(TypeError, match=msg):
na.c_array(None)
@@ -151,6 +140,22 @@ def test_c_array_slice_errors():
array[1:0]
+def test_c_array_builder_init():
+ builder = CArrayBuilder.allocate()
+
+ with pytest.raises(RuntimeError, match="CArrayBuilder is not initialized"):
+ builder.is_empty()
+
+ builder.init_from_type(na.Type.INT32.value)
+ assert builder.is_empty()
+
+ with pytest.raises(RuntimeError, match="CArrayBuilder is already initialized"):
+ builder.init_from_type(na.Type.INT32.value)
+
+ with pytest.raises(RuntimeError, match="CArrayBuilder is already initialized"):
+ builder.init_from_schema(na.c_schema(na.int32()))
+
+
def test_c_array_from_pybuffer_uint8():
data = b"abcdefg"
c_array = na.c_array(data)
@@ -230,7 +235,7 @@ def test_c_array_from_iterable_string():
assert len(array_view.buffer(2)) == 7
# Check an item that is not a str()
- with pytest.raises(TypeError):
+ with pytest.raises(ValueError):
na.c_array([b"1234"], na.string())
@@ -244,7 +249,7 @@ def test_c_array_from_iterable_bytes():
assert len(array_view.buffer(1)) == 4
assert len(array_view.buffer(2)) == 7
- with pytest.raises(TypeError):
+ with pytest.raises(ValueError):
na.c_array(["1234"], na.binary())
buf_not_bytes = na.c_buffer([1, 2, 3], na.int32())
diff --git a/python/tests/test_c_array_stream.py b/python/tests/test_c_array_stream.py
index fc6925d8..0fe38f4a 100644
--- a/python/tests/test_c_array_stream.py
+++ b/python/tests/test_c_array_stream.py
@@ -17,7 +17,7 @@
import pytest
from nanoarrow._lib import NanoarrowException
-from nanoarrow.c_lib import CArrayStream
+from nanoarrow.c_array_stream import CArrayStream
import nanoarrow as na
@@ -101,7 +101,7 @@ def test_c_array_stream_from_c_array_fallback():
def test_c_array_stream_error():
- msg = "Can't convert object of type NoneType"
+ msg = "Can't resolve ArrayBuilder"
with pytest.raises(TypeError, match=msg):
na.c_array_stream(None)
diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py
index 68277921..40ba4a3b 100644
--- a/python/tests/test_c_buffer.py
+++ b/python/tests/test_c_buffer.py
@@ -19,7 +19,7 @@ import struct
import sys
import pytest
-from nanoarrow.c_lib import CBuffer, CBufferBuilder
+from nanoarrow._lib import CBuffer, CBufferBuilder
import nanoarrow as na
@@ -265,7 +265,7 @@ def test_c_buffer_from_iterable():
na.c_buffer([1, 2, 3], na.date32())
with pytest.raises(ValueError, match="Can't create buffer"):
- na.c_buffer([1, 2, 3], na.extension_type(na.null(), "arrow.test"))
+ na.c_buffer([1, 2, 3], na.extension_type(na.int32(), "arrow.test"))
def test_c_buffer_from_fixed_size_binary_iterable():
diff --git a/python/tests/test_c_buffer_view.py b/python/tests/test_c_buffer_view.py
index 79ac2a1b..b885488e 100644
--- a/python/tests/test_c_buffer_view.py
+++ b/python/tests/test_c_buffer_view.py
@@ -16,13 +16,12 @@
# under the License.
import pytest
-from nanoarrow.c_lib import c_array_view
import nanoarrow as na
def test_buffer_view_bool():
- bool_array_view = c_array_view([1, 0, 0, 1], na.bool())
+ bool_array_view = na.c_array_view([1, 0, 0, 1], na.bool())
view = bool_array_view.buffer(1)
assert view.element_size_bits == 1
@@ -64,7 +63,7 @@ def test_buffer_view_bool():
def test_buffer_view_non_bool():
- array_view = c_array_view([1, 2, 3, 5], na.int32())
+ array_view = na.c_array_view([1, 2, 3, 5], na.int32())
view = array_view.buffer(1)
assert view.element_size_bits == 32
diff --git a/python/tests/test_c_schema_view.py b/python/tests/test_c_schema_view.py
index 968415a2..74309b0a 100644
--- a/python/tests/test_c_schema_view.py
+++ b/python/tests/test_c_schema_view.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-from nanoarrow.c_lib import c_schema_view
+from nanoarrow.c_schema import c_schema_view
import nanoarrow as na
diff --git a/python/tests/test_capsules.py b/python/tests/test_capsules.py
index 926aefa4..149b3ff0 100644
--- a/python/tests/test_capsules.py
+++ b/python/tests/test_capsules.py
@@ -143,25 +143,29 @@ def test_export_invalid():
def test_import_from_c_errors():
+ from nanoarrow.c_array import CArray
+ from nanoarrow.c_array_stream import CArrayStream
+ from nanoarrow.c_schema import CSchema
+
# ensure proper error is raised in case of wrong object or wrong capsule
pa_arr = pa.array([1, 2, 3], pa.int32())
with pytest.raises(ValueError):
- na.c_lib.CSchema._import_from_c_capsule("wrong")
+ CSchema._import_from_c_capsule("wrong")
with pytest.raises(ValueError):
- na.c_lib.CSchema._import_from_c_capsule(pa_arr.__arrow_c_array__())
+ CSchema._import_from_c_capsule(pa_arr.__arrow_c_array__())
with pytest.raises(ValueError):
- na.c_lib.CArray._import_from_c_capsule("wrong", "wrong")
+ CArray._import_from_c_capsule("wrong", "wrong")
with pytest.raises(ValueError):
- na.c_lib.CArray._import_from_c_capsule(
+ CArray._import_from_c_capsule(
pa_arr.__arrow_c_array__(), pa_arr.type.__arrow_c_schema__()
)
with pytest.raises(ValueError):
- na.c_lib.CArrayStream._import_from_c_capsule("wrong")
+ CArrayStream._import_from_c_capsule("wrong")
with pytest.raises(ValueError):
- na.c_lib.CArrayStream._import_from_c_capsule(pa_arr.__arrow_c_array__())
+ CArrayStream._import_from_c_capsule(pa_arr.__arrow_c_array__())
diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py
index 9ee48463..99904774 100644
--- a/python/tests/test_nanoarrow.py
+++ b/python/tests/test_nanoarrow.py
@@ -32,22 +32,26 @@ def test_c_version():
def test_c_schema_helper():
+ from nanoarrow.c_schema import CSchema
+
schema = na.allocate_c_schema()
assert na.c_schema(schema) is schema
schema = na.c_schema(pa.null())
- assert isinstance(schema, na.c_lib.CSchema)
+ assert isinstance(schema, CSchema)
with pytest.raises(TypeError):
na.c_schema(1234)
def test_c_array_helper():
+ from nanoarrow.c_array import CArray
+
array = na.allocate_c_array()
assert na.c_array(array) is array
array = na.c_array(pa.array([], pa.null()))
- assert isinstance(array, na.c_lib.CArray)
+ assert isinstance(array, CArray)
with pytest.raises(TypeError):
na.c_array(1234)
@@ -62,9 +66,11 @@ def test_array_stream_helper():
def test_array_view_helper():
+ from nanoarrow.c_array import CArrayView
+
array = na.c_array(pa.array([1, 2, 3]))
view = na.c_array_view(array)
- assert isinstance(view, na.c_lib.CArrayView)
+ assert isinstance(view, CArrayView)
assert na.c_array_view(view) is view