This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new adcbc770 feat(python): Implement extension type and Schema metadata
support (#431)
adcbc770 is described below
commit adcbc770ccb61b2f2557428493fe1264fc3c02f4
Author: Dewey Dunnington <[email protected]>
AuthorDate: Fri Apr 26 23:14:08 2024 -0300
feat(python): Implement extension type and Schema metadata support (#431)
The initial motivation of this PR was to ensure that extension types are
handled in nanoarrow's Schemas; however, this exposed that metadata was
not handled by the `Schema` in either create or consume mode.
After this PR, extension types can be created and inspected. This is
really just creating a schema with some metadata (and looking for
specific metadata when consuming).
```python
import nanoarrow as na
schema = na.extension_type(na.int32(), "arrow.example", b'{"some_param": 1234}')
if schema.type == na.Type.EXTENSION:
print(schema.extension.name, schema.extension.metadata)
#> arrow.example b'{"some_param": 1234}'
```
In doing some testing, there were a number of places where extension
schemas/arrays were implicitly treated as their storage types. I've
tried to error or warn for these cases as much as possible:
```python
import nanoarrow as na
ext = na.extension_type(na.int32(), "arrow.example")
na.Array([1, 2, 3], ext)
#> TypeError: ...
#> Can't create buffer from extension type arrow.example
```
It's a little hard to create an extension array at the moment (and there
should probably be a similar option to strip the extension type from an
`Array` to just get the storage), but I think that is maybe a job for
another PR that is more about arrays and less about schemas:
```python
import nanoarrow as na
ext = na.extension_type(na.int32(), "arrow.example")
storage = na.c_array([1, 2, 3], ext.extension.storage)
_, storage_capsule = storage.__arrow_c_array__()
extension = na.Array(storage_capsule, ext)
list(extension.iter_py())
#> UnregisteredExtensionWarning: <unnamed int32>: Converting unregistered extension 'arrow.example' as storage type
#> [1, 2, 3]
```
A side effect of all of this is that there is better support for
modifying schemas:
```python
import nanoarrow as na
na.Schema(na.int32(), name="some_col")
#> Schema(INT32, name='some_col')
schema = na.Schema(na.int32(), metadata={"some_key": "some_value"})
schema.metadata[b"some_key"]
#> b'some_value'
na.c_schema(na.int32()).modify(
name="some_col",
metadata={"some_key": "some_value"},
nullable=False
)
#> <nanoarrow.c_lib.CSchema int32>
#> - format: 'i'
#> - name: 'some_col'
#> - flags: 0
#> - metadata:
#> - b'some_key': b'some_value'
#> - dictionary: NULL
#> - children[0]:
```
---------
Co-authored-by: Dane Pitkin <[email protected]>
---
python/src/nanoarrow/__init__.py | 4 +-
python/src/nanoarrow/_lib.pyx | 102 ++++++++++++++++-
python/src/nanoarrow/_repr_utils.py | 2 +-
python/src/nanoarrow/c_lib.py | 6 +-
python/src/nanoarrow/iterator.py | 15 +++
python/src/nanoarrow/schema.py | 217 ++++++++++++++++++++++++++----------
python/tests/test_c_buffer.py | 3 +
python/tests/test_c_schema.py | 157 ++++++++++++++++++++++++++
python/tests/test_iterator.py | 11 ++
python/tests/test_nanoarrow.py | 104 -----------------
python/tests/test_schema.py | 50 ++++++++-
11 files changed, 496 insertions(+), 175 deletions(-)
diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index a86e8691..c73f60e9 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -41,7 +41,6 @@ from nanoarrow.schema import (
Schema,
Type,
TimeUnit,
- schema,
null,
bool,
int8,
@@ -65,6 +64,7 @@ from nanoarrow.schema import (
time32,
time64,
timestamp,
+ extension_type,
duration,
interval_months,
interval_day_time,
@@ -102,6 +102,7 @@ __all__ = [
"decimal128",
"decimal256",
"duration",
+ "extension_type",
"fixed_size_binary",
"float16",
"float32",
@@ -116,7 +117,6 @@ __all__ = [
"large_binary",
"large_string",
"null",
- "schema",
"string",
"struct",
"time32",
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index e215f01a..752a009c 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -34,7 +34,7 @@ generally have better autocomplete + documentation available
to IDEs).
from libc.stdint cimport uintptr_t, uint8_t, int64_t
from libc.string cimport memcpy
from libc.stdio cimport snprintf
-from cpython.bytes cimport PyBytes_FromStringAndSize
+from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AsString,
PyBytes_Size
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer,
PyCapsule_IsValid
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cpython cimport (
@@ -769,6 +769,28 @@ cdef class CSchema:
else:
return None
+ def modify(self, *, name=None, flags=None, nullable=None, metadata=None,
+ validate=True):
+ builder = CSchemaBuilder.copy(self)
+
+ if name is not None:
+ builder.set_name(name)
+
+ if flags is not None:
+ builder.set_flags(flags)
+
+ if nullable is not None:
+ builder.set_nullable(nullable)
+
+ if metadata is not None:
+ builder.clear_metadata()
+ builder.append_metadata(metadata)
+
+ if validate:
+ builder.validate()
+
+ return builder.finish()
+
cdef class CSchemaView:
"""Low-level ArrowSchemaView wrapper
@@ -967,10 +989,50 @@ cdef class CSchemaBuilder:
if self._ptr.release == NULL:
ArrowSchemaInit(self._ptr)
+ @staticmethod
+ def copy(CSchema schema):
+ return CSchemaBuilder(schema.__deepcopy__())
+
@staticmethod
def allocate():
return CSchemaBuilder(CSchema.allocate())
+ def clear_metadata(self):
+ cdef int code = ArrowSchemaSetMetadata(self.c_schema._ptr, NULL)
+ Error.raise_error_not_ok("ArrowSchemaSetMetadata()", code)
+ return self
+
+ def append_metadata(self, metadata):
+ cdef CBuffer buffer = CBuffer.empty()
+
+ cdef const char* existing_metadata = self.c_schema._ptr.metadata
+ cdef int code = ArrowMetadataBuilderInit(buffer._ptr,
existing_metadata)
+ Error.raise_error_not_ok("ArrowMetadataBuilderInit()", code)
+
+ cdef ArrowStringView key
+ cdef ArrowStringView value
+ cdef int32_t keys_added = 0
+
+ for k, v in metadata.items():
+ k = k.encode() if isinstance(k, str) else bytes(k)
+ key.data = PyBytes_AsString(k)
+ key.size_bytes = PyBytes_Size(k)
+
+ v = v.encode() if isinstance(v, str) else bytes(v)
+ value.data = PyBytes_AsString(v)
+ value.size_bytes = PyBytes_Size(v)
+
+ code = ArrowMetadataBuilderAppend(buffer._ptr, key, value)
+ Error.raise_error_not_ok("ArrowMetadataBuilderAppend()", code)
+
+ keys_added += 1
+
+ if keys_added > 0:
+ code = ArrowSchemaSetMetadata(self.c_schema._ptr, <const
char*>buffer._ptr.data)
+ Error.raise_error_not_ok("ArrowSchemaSetMetadata()", code)
+
+ return self
+
def child(self, int64_t i):
return CSchemaBuilder(self.c_schema.child(i))
@@ -1058,6 +1120,10 @@ cdef class CSchemaBuilder:
return self
+ def set_flags(self, flags):
+ self._ptr.flags = flags
+ return self
+
def set_nullable(self, nullable):
if nullable:
self._ptr.flags = self._ptr.flags | ARROW_FLAG_NULLABLE
@@ -1066,6 +1132,9 @@ cdef class CSchemaBuilder:
return self
+ def validate(self):
+ return CSchemaView(self.c_schema)
+
def finish(self):
self.c_schema._assert_valid()
@@ -1463,7 +1532,11 @@ cdef class SchemaMetadata:
self._base = base
self._metadata = <const char*>ptr
- def _init_reader(self):
+ @staticmethod
+ def empty():
+ return SchemaMetadata(None, 0)
+
+ cdef _init_reader(self):
cdef int code = ArrowMetadataReaderInit(&self._reader, self._metadata)
Error.raise_error_not_ok("ArrowMetadataReaderInit()", code)
@@ -1471,13 +1544,36 @@ cdef class SchemaMetadata:
self._init_reader()
return self._reader.remaining_keys
+ def __contains__(self, item):
+ for key, _ in self.items():
+ if item == key:
+ return True
+
+ return False
+
+ def __getitem__(self, k):
+ out = None
+
+ for key, value in self.items():
+ if k == key:
+ if out is None:
+ out = value
+ else:
+ raise KeyError(f"key {k} matches more than one value in
metadata")
+
+ return out
+
def __iter__(self):
+ for key, _ in self.items():
+ yield key
+
+ def items(self):
cdef ArrowStringView key
cdef ArrowStringView value
self._init_reader()
while self._reader.remaining_keys > 0:
ArrowMetadataReaderRead(&self._reader, &key, &value)
- key_obj = PyBytes_FromStringAndSize(key.data,
key.size_bytes).decode('UTF-8')
+ key_obj = PyBytes_FromStringAndSize(key.data, key.size_bytes)
value_obj = PyBytes_FromStringAndSize(value.data, value.size_bytes)
yield key_obj, value_obj
diff --git a/python/src/nanoarrow/_repr_utils.py
b/python/src/nanoarrow/_repr_utils.py
index bd090af5..2c3ce0da 100644
--- a/python/src/nanoarrow/_repr_utils.py
+++ b/python/src/nanoarrow/_repr_utils.py
@@ -54,7 +54,7 @@ def schema_repr(schema, indent=0):
lines.append(f"{indent_str}- metadata: NULL")
else:
lines.append(f"{indent_str}- metadata:")
- for key, value in metadata:
+ for key, value in metadata.items():
lines.append(f"{indent_str} - {repr(key)}: {repr(value)}")
if schema.dictionary:
diff --git a/python/src/nanoarrow/c_lib.py b/python/src/nanoarrow/c_lib.py
index 0cd5a734..6c1ff2b8 100644
--- a/python/src/nanoarrow/c_lib.py
+++ b/python/src/nanoarrow/c_lib.py
@@ -681,7 +681,11 @@ def _c_buffer_from_iterable(obj, schema=None) -> CBuffer:
raise ValueError("CBuffer from iterable requires schema")
schema_view = c_schema_view(schema)
- if schema_view.storage_type_id != schema_view.type_id:
+ if schema_view.extension_name is not None:
+ raise ValueError(
+ f"Can't create buffer from extension type
{schema_view.extension_name}"
+ )
+ elif schema_view.storage_type_id != schema_view.type_id:
raise ValueError(
f"Can't create buffer from iterable for type {schema_view.type}"
)
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index b0f26a27..08ec67f9 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -124,6 +124,10 @@ class LossyConversionWarning(UserWarning):
pass
+class UnregisteredExtensionWarning(UserWarning):
+ pass
+
+
class ArrayViewBaseIterator:
"""Base class for iterators that use an internal ArrowArrayView
as the basis for conversion to Python objects. Intended for internal use.
@@ -195,6 +199,17 @@ class PyIterator(ArrayViewBaseIterator):
"""
def _iter_chunk(self, offset, length):
+ # Check for an extension type first since this isn't reflected by
+ # self._schema_view.type_id. Currently we just return the storage
+ # iterator with a warning for extension types.
+ maybe_extension_name = self._schema_view.extension_name
+ if maybe_extension_name:
+ self._warn(
+ f"Converting unregistered extension '{maybe_extension_name}' "
+ "as storage type",
+ UnregisteredExtensionWarning,
+ )
+
type_id = self._schema_view.type_id
if type_id not in _ITEMS_ITER_LOOKUP:
raise KeyError(
diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py
index 4aaad073..e521a342 100644
--- a/python/src/nanoarrow/schema.py
+++ b/python/src/nanoarrow/schema.py
@@ -17,9 +17,16 @@
import enum
import reprlib
-from typing import Union
-
-from nanoarrow._lib import CArrowTimeUnit, CArrowType, CSchemaBuilder,
CSchemaView
+from functools import cached_property
+from typing import Mapping, Union
+
+from nanoarrow._lib import (
+ CArrowTimeUnit,
+ CArrowType,
+ CSchemaBuilder,
+ CSchemaView,
+ SchemaMetadata,
+)
from nanoarrow.c_lib import c_schema
@@ -108,10 +115,75 @@ class TimeUnit(enum.Enum):
return TimeUnit(obj)
+class ExtensionAccessor:
+ """Accessor for extension type parameters"""
+
+ def __init__(self, schema) -> None:
+ self._schema = schema
+
+ @property
+ def name(self) -> str:
+ """Extension name for this extension type"""
+ return self._schema._c_schema_view.extension_name
+
+ @property
+ def metadata(self) -> Union[bytes, None]:
+ """Extension metadata for this extension type if present"""
+ extension_metadata = self._schema._c_schema_view.extension_metadata
+ return extension_metadata if extension_metadata else None
+
+ @property
+ def storage(self):
+ """Storage type for this extension type"""
+ metadata = dict(self._schema.metadata.items())
+
+ # Remove metadata keys that cause this type to be treated as an
extension
+ del metadata[b"ARROW:extension:name"]
+ if b"ARROW:extension:metadata" in metadata:
+ del metadata[b"ARROW:extension:metadata"]
+
+ return Schema(self._schema, metadata=metadata)
+
+
class Schema:
- """The Schema is nanoarrow's high-level data type representation whose
scope maps to
- that of the ArrowSchema in the Arrow C Data interface. See :func:`schema`
for class
- details.
+ """Create a nanoarrow Schema
+
+ The Schema is nanoarrow's high-level data type representation, encompassing
+ the role of PyArrow's ``Schema``, ``Field``, and ``DataType``. This scope
+ maps to that of the ArrowSchema in the Arrow C Data interface.
+
+ Parameters
+ ----------
+ obj :
+ A :class:`Type` specifier or a schema-like object. A schema-like object
+ includes:
+ * A ``pyarrow.Schema``, `pyarrow.Field``, or ``pyarrow.DataType``
+ * A nanoarrow :class:`Schema`, :class:`CSchema`, or :class:`Type`
+ * Any object implementing the Arrow PyCapsule interface protocol
method.
+
+ name : str, optional
+ An optional name to bind to this field.
+
+ nullable : bool, optional
+ Explicitly specify field nullability. Fields are nullable by default.
+
+ metadata : mapping, optional
+ Explicitly specify field metadata.
+
+ params :
+ Type-specific parameters when ``obj`` is a :class:`Type`.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> import pyarrow as pa
+ >>> na.Schema(na.Type.INT32)
+ Schema(INT32)
+ >>> na.Schema(na.Type.DURATION, unit=na.TimeUnit.SECOND)
+ Schema(DURATION, unit=SECOND)
+ >>> na.Schema(pa.int32())
+ Schema(INT32)
"""
def __init__(
@@ -120,18 +192,24 @@ class Schema:
*,
name=None,
nullable=None,
+ metadata=None,
**params,
) -> None:
if isinstance(obj, Type):
- self._c_schema = _c_schema_from_type_and_params(obj, params, name,
nullable)
- elif not params and nullable is None and name is None:
- self._c_schema = c_schema(obj)
- else:
- # A future version could also deep copy the schema and update it
if these
- # values *are* specified.
- raise ValueError(
- "params, nullable, and name must be unspecified if type is not
"
- "nanoarrow.Type"
+ self._c_schema = _c_schema_from_type_and_params(
+ obj, params, name, nullable, metadata
+ )
+ self._c_schema_view = CSchemaView(self._c_schema)
+ return
+
+ if params:
+ raise ValueError("params are only supported for obj of class Type")
+
+ self._c_schema = c_schema(obj)
+
+ if name is not None or nullable is not None or metadata is not None:
+ self._c_schema = self._c_schema.modify(
+ name=name, nullable=nullable, metadata=metadata
)
self._c_schema_view = CSchemaView(self._c_schema)
@@ -144,7 +222,10 @@ class Schema:
>>> na.int32().type
<Type.INT32: 8>
"""
- return Type(self._c_schema_view.type_id)
+ if self._c_schema_view.extension_name:
+ return Type.EXTENSION
+ else:
+ return Type(self._c_schema_view.type_id)
@property
def name(self) -> Union[str, None]:
@@ -169,6 +250,35 @@ class Schema:
"""
return self._c_schema_view.nullable
+ @cached_property
+ def metadata(self) -> Mapping[bytes, bytes]:
+ """Access field metadata of this field
+
+ >>> import nanoarrow as na
+ >>> schema = na.Schema(na.int32(), metadata={"key": "value"})
+ >>> dict(schema.metadata.items())
+ {b'key': b'value'}
+ """
+ c_schema_metadata = self._c_schema.metadata
+ return (
+ SchemaMetadata.empty() if c_schema_metadata is None else
c_schema_metadata
+ )
+
+ @cached_property
+ def extension(self) -> Union[ExtensionAccessor, None]:
+ """Access extension type attributes
+
+ >>> import nanoarrow as na
+ >>> schema = na.extension_type(na.int32(), "arrow.example", b"{}")
+ >>> schema.extension.name
+ 'arrow.example'
+ >>> schema.extension.metadata
+ b'{}'
+ """
+ extension_name = self._c_schema_view.extension_name
+ if extension_name:
+ return ExtensionAccessor(self)
+
@property
def byte_width(self) -> Union[int, None]:
"""Element byte width for fixed-size binary type
@@ -280,48 +390,6 @@ class Schema:
return self._c_schema.__arrow_c_schema__()
-def schema(obj, *, name=None, nullable=None, **params):
- """Create a nanoarrow Schema
-
- The Schema is nanoarrow's high-level data type representation, encompasing
- the role of PyArrow's ``Schema``, ``Field``, and ``DataType``. This scope
- maps to that of the ArrowSchema in the Arrow C Data interface.
-
- Parameters
- ----------
- obj :
- A :class:`Type` specifier or a schema-like object. A schema-like object
- includes:
- * A ``pyarrow.Schema``, `pyarrow.Field``, or ``pyarrow.DataType``
- * A nanoarrow :class:`Schema`, :class:`CSchema`, or :class:`Type`
- * Any object implementing the Arrow PyCapsule interface protocol
method.
-
- name : str, optional
- An optional name to bind to this field.
-
- nullable : bool, optional
- Explicitly specify field nullability. Fields are nullable by default.
- Only supported if ``obj`` is a :class:`Type` object (for any other
input,
- the nullability is preserved from the passed object).
-
- params :
- Type-specific parameters when ``obj`` is a :class:`Type`.
-
- Examples
- --------
-
- >>> import nanoarrow as na
- >>> import pyarrow as pa
- >>> na.schema(na.Type.INT32)
- Schema(INT32)
- >>> na.schema(na.Type.DURATION, unit=na.TimeUnit.SECOND)
- Schema(DURATION, unit=SECOND)
- >>> na.schema(pa.int32())
- Schema(INT32)
- """
- return Schema(obj, name=name, nullable=nullable, **params)
-
-
def null(nullable: bool = True) -> Schema:
"""Create an instance of a null type.
@@ -897,11 +965,40 @@ def struct(fields, nullable=True) -> Schema:
return Schema(Type.STRUCT, fields=fields, nullable=nullable)
+def extension_type(
+ storage_schema,
+ extension_name: str,
+ extension_metadata: Union[str, bytes, None] = None,
+ nullable: bool = True,
+) -> Schema:
+ """Create an Arrow extension type
+
+ Parameters
+ ----------
+ extension_name: str
+ The extension name to associate with this type.
+ extension_metadata: str or bytes, optional
+ Extension metadata containing extension parameters associated with this
+ extension type.
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+ """
+ storage_schema = c_schema(storage_schema)
+ storage_metadata = storage_schema.metadata
+ metadata = dict(storage_metadata) if storage_metadata else {}
+ metadata["ARROW:extension:name"] = extension_name
+ if extension_metadata:
+ metadata["ARROW:extension:metadata"] = extension_metadata
+
+ return Schema(storage_schema, nullable=nullable, metadata=metadata)
+
+
def _c_schema_from_type_and_params(
type: Type,
params: dict,
name: Union[bool, None, bool],
nullable: Union[bool, None],
+ metadata: Mapping[Union[str, bytes], Union[str, bytes]],
):
factory = CSchemaBuilder.allocate()
@@ -952,6 +1049,10 @@ def _c_schema_from_type_and_params(
name = None
factory.set_name(name)
+ # Apply metadata
+ if metadata is not None:
+ factory.append_metadata(metadata)
+
return factory.finish()
diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py
index 0c79de17..68277921 100644
--- a/python/tests/test_c_buffer.py
+++ b/python/tests/test_c_buffer.py
@@ -264,6 +264,9 @@ def test_c_buffer_from_iterable():
with pytest.raises(ValueError, match="Can't create buffer"):
na.c_buffer([1, 2, 3], na.date32())
+ with pytest.raises(ValueError, match="Can't create buffer"):
+ na.c_buffer([1, 2, 3], na.extension_type(na.null(), "arrow.test"))
+
def test_c_buffer_from_fixed_size_binary_iterable():
items = [b"abcd", b"efgh", b"ijkl"]
diff --git a/python/tests/test_c_schema.py b/python/tests/test_c_schema.py
new file mode 100644
index 00000000..5617fe7b
--- /dev/null
+++ b/python/tests/test_c_schema.py
@@ -0,0 +1,157 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+import nanoarrow as na
+
+
+def test_c_schema_basic():
+ schema = na.allocate_c_schema()
+ assert schema.is_valid() is False
+ assert schema._to_string() == "[invalid: schema is released]"
+ assert repr(schema) == "<nanoarrow.c_lib.CSchema <released>>"
+
+ schema = na.c_schema(na.struct({"some_name": na.int32()}))
+
+ assert schema.format == "+s"
+ assert schema.flags == 2
+ assert schema.metadata is None
+ assert schema.n_children == 1
+ assert len(list(schema.children)) == 1
+ assert schema.child(0).format == "i"
+ assert schema.child(0).name == "some_name"
+ assert schema.child(0)._to_string() == "int32"
+ assert "<nanoarrow.c_lib.CSchema int32>" in repr(schema)
+ assert schema.dictionary is None
+
+ with pytest.raises(IndexError):
+ schema.child(1)
+
+
+def test_c_schema_dictionary():
+ pa = pytest.importorskip("pyarrow")
+
+ schema = na.c_schema(pa.dictionary(pa.int32(), pa.utf8()))
+ assert schema.format == "i"
+ assert schema.dictionary.format == "u"
+ assert "dictionary: <nanoarrow.c_lib.CSchema string" in repr(schema)
+
+
+def test_schema_metadata():
+ meta = {"key1": "value1", "key2": "value2"}
+ schema = na.c_schema(na.int32()).modify(metadata=meta)
+
+ assert len(schema.metadata) == 2
+
+ meta2 = {k: v for k, v in schema.metadata.items()}
+ assert list(meta2.keys()) == [b"key1", b"key2"]
+ assert list(meta2.values()) == [b"value1", b"value2"]
+ assert "b'key1': b'value1'" in repr(schema)
+
+
+def test_c_schema_view():
+ schema = na.allocate_c_schema()
+ with pytest.raises(RuntimeError):
+ na.c_schema_view(schema)
+
+ schema = na.c_schema(na.int32())
+ view = na.c_schema_view(schema)
+ assert "- type: 'int32'" in repr(view)
+ assert view.type == "int32"
+ assert view.storage_type == "int32"
+
+ assert view.fixed_size is None
+ assert view.decimal_bitwidth is None
+ assert view.decimal_scale is None
+ assert view.time_unit is None
+ assert view.timezone is None
+ assert view.union_type_ids is None
+ assert view.extension_name is None
+ assert view.extension_metadata is None
+
+
+def test_c_schema_view_extra_params():
+ view = na.c_schema_view(na.fixed_size_binary(12))
+ assert view.fixed_size == 12
+
+ view = na.c_schema_view(na.decimal128(10, 3))
+ assert view.decimal_bitwidth == 128
+ assert view.decimal_precision == 10
+ assert view.decimal_scale == 3
+
+ view = na.c_schema_view(na.decimal256(10, 3))
+ assert view.decimal_bitwidth == 256
+ assert view.decimal_precision == 10
+ assert view.decimal_scale == 3
+
+ view = na.c_schema_view(na.duration("us"))
+ assert view.time_unit == "us"
+
+ view = na.c_schema_view(na.timestamp("us", "America/Halifax"))
+ assert view.type == "timestamp"
+ assert view.storage_type == "int64"
+ assert view.time_unit == "us"
+ assert view.timezone == "America/Halifax"
+
+ pa = pytest.importorskip("pyarrow")
+
+ view = na.c_schema_view(pa.list_(pa.int32(), 12))
+ assert view.fixed_size == 12
+
+
+def test_c_schema_metadata():
+ meta = {
+ "ARROW:extension:name": "some_name",
+ "ARROW:extension:metadata": "some_metadata",
+ }
+
+ schema = na.c_schema(na.int32()).modify(metadata=meta)
+ view = na.c_schema_view(schema)
+ assert view.extension_name == "some_name"
+ assert view.extension_metadata == b"some_metadata"
+
+
+def test_c_schema_modify():
+ schema = na.c_schema(na.null())
+
+ schema_clone = schema.modify()
+ assert schema_clone is not schema
+ assert schema._addr() != schema_clone._addr()
+
+ schema_named = schema.modify(name="something else")
+ assert schema_named.name == "something else"
+ assert schema_named.format == schema.format
+
+ schema_flagged = schema.modify(flags=0)
+ assert schema_flagged.flags == 0
+ assert schema_flagged.format == schema.format
+
+ schema_non_nullable = schema.modify(nullable=False)
+ assert schema_non_nullable.flags == 0
+ assert schema_non_nullable.format == schema.format
+
+ meta = {"some key": "some value"}
+ schema_metad = schema.modify(metadata=meta)
+ assert list(schema_metad.metadata.items()) == [(b"some key", b"some
value")]
+ assert schema_non_nullable.format == schema.format
+
+ schema_metad2 = schema.modify(metadata=schema_metad.metadata)
+ assert list(schema_metad2.metadata.items()) == [(b"some key", b"some
value")]
+
+ schema_no_metad = schema_metad.modify(metadata={})
+ assert schema_no_metad.metadata is None
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index 4b527b13..d43813fd 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -23,6 +23,7 @@ from nanoarrow.iterator import (
ArrayViewBaseIterator,
InvalidArrayWarning,
LossyConversionWarning,
+ UnregisteredExtensionWarning,
iter_array_views,
iter_py,
iter_tuples,
@@ -493,3 +494,13 @@ def test_iterator_duration():
items[0] = datetime.timedelta(days=-12, seconds=-345)
array = pa.array(items, pa.duration("s"))
assert list(iter_py(array)) == items
+
+
+def test_iterator_extension():
+ schema = na.extension_type(na.int32(), "arrow.test")
+ storage_array = na.c_array([1, 2, 3], na.int32())
+ _, storage_array_capsule = na.c_array(storage_array).__arrow_c_array__()
+ extension_array = na.c_array(storage_array_capsule, schema)
+
+ with pytest.warns(UnregisteredExtensionWarning):
+ assert list(iter_py(extension_array)) == [1, 2, 3]
diff --git a/python/tests/test_nanoarrow.py b/python/tests/test_nanoarrow.py
index 689905eb..9ee48463 100644
--- a/python/tests/test_nanoarrow.py
+++ b/python/tests/test_nanoarrow.py
@@ -68,110 +68,6 @@ def test_array_view_helper():
assert na.c_array_view(view) is view
-def test_c_schema_basic():
- schema = na.allocate_c_schema()
- assert schema.is_valid() is False
- assert schema._to_string() == "[invalid: schema is released]"
- assert repr(schema) == "<nanoarrow.c_lib.CSchema <released>>"
-
- schema = na.c_schema(pa.schema([pa.field("some_name", pa.int32())]))
-
- assert schema.format == "+s"
- assert schema.flags == 0
- assert schema.metadata is None
- assert schema.n_children == 1
- assert len(list(schema.children)) == 1
- assert schema.child(0).format == "i"
- assert schema.child(0).name == "some_name"
- assert schema.child(0)._to_string() == "int32"
- assert "<nanoarrow.c_lib.CSchema int32>" in repr(schema)
- assert schema.dictionary is None
-
- with pytest.raises(IndexError):
- schema.child(1)
-
-
-def test_c_schema_dictionary():
- schema = na.c_schema(pa.dictionary(pa.int32(), pa.utf8()))
- assert schema.format == "i"
- assert schema.dictionary.format == "u"
- assert "dictionary: <nanoarrow.c_lib.CSchema string" in repr(schema)
-
-
-def test_schema_metadata():
- meta = {"key1": "value1", "key2": "value2"}
- schema = na.c_schema(pa.field("", pa.int32(), metadata=meta))
-
- assert len(schema.metadata) == 2
-
- meta2 = {k: v for k, v in schema.metadata}
- assert list(meta2.keys()) == ["key1", "key2"]
- assert list(meta2.values()) == [b"value1", b"value2"]
- assert "'key1': b'value1'" in repr(schema)
-
-
-def test_c_schema_view():
- schema = na.allocate_c_schema()
- with pytest.raises(RuntimeError):
- na.c_schema_view(schema)
-
- schema = na.c_schema(pa.int32())
- view = na.c_schema_view(schema)
- assert "- type: 'int32'" in repr(view)
- assert view.type == "int32"
- assert view.storage_type == "int32"
-
- assert view.fixed_size is None
- assert view.decimal_bitwidth is None
- assert view.decimal_scale is None
- assert view.time_unit is None
- assert view.timezone is None
- assert view.union_type_ids is None
- assert view.extension_name is None
- assert view.extension_metadata is None
-
-
-def test_c_schema_view_extra_params():
- schema = na.c_schema(pa.binary(12))
- view = na.c_schema_view(schema)
- assert view.fixed_size == 12
-
- schema = na.c_schema(pa.list_(pa.int32(), 12))
- assert view.fixed_size == 12
-
- schema = na.c_schema(pa.decimal128(10, 3))
- view = na.c_schema_view(schema)
- assert view.decimal_bitwidth == 128
- assert view.decimal_precision == 10
- assert view.decimal_scale == 3
-
- schema = na.c_schema(pa.decimal256(10, 3))
- view = na.c_schema_view(schema)
- assert view.decimal_bitwidth == 256
- assert view.decimal_precision == 10
- assert view.decimal_scale == 3
-
- schema = na.c_schema(pa.duration("us"))
- view = na.c_schema_view(schema)
- assert view.time_unit == "us"
-
- schema = na.c_schema(pa.timestamp("us", tz="America/Halifax"))
- view = na.c_schema_view(schema)
- assert view.type == "timestamp"
- assert view.storage_type == "int64"
- assert view.time_unit == "us"
- assert view.timezone == "America/Halifax"
-
- meta = {
- "ARROW:extension:name": "some_name",
- "ARROW:extension:metadata": "some_metadata",
- }
- schema = na.c_schema(pa.field("", pa.int32(), metadata=meta))
- view = na.c_schema_view(schema)
- assert view.extension_name == "some_name"
- assert view.extension_metadata == b"some_metadata"
-
-
def test_c_array_empty():
array = na.allocate_c_array()
assert array.is_valid() is False
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index bc27214b..cc9e42b2 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -37,19 +37,26 @@ def test_time_unit_create():
def test_schema_create_c_schema():
schema_obj = na.int32()
assert schema_obj.type == na.Type.INT32
+ assert schema_obj.name == ""
+ assert ("some key" in schema_obj.metadata) is False
schema_obj2 = na.Schema(schema_obj._c_schema)
assert schema_obj2.type == schema_obj2.type
assert schema_obj2._c_schema is schema_obj._c_schema
- with pytest.raises(ValueError, match="must be unspecified"):
- na.Schema(schema_obj._c_schema, some_parameter="some_value")
+ schema_obj_non_nullable = na.Schema(na.int32(), nullable=False)
+ assert schema_obj_non_nullable.nullable is False
+
+ schema_named = na.Schema(na.int32(), name="some_name")
+ assert schema_named.name == "some_name"
- with pytest.raises(ValueError, match="must be unspecified"):
- na.Schema(schema_obj._c_schema, nullable=True)
+ schema_metad = na.Schema(na.int32(), metadata={"some key": "some value"})
+ assert b"some key" in schema_metad.metadata
+ assert schema_metad.metadata[b"some key"] == b"some value"
+ assert dict(schema_metad.metadata.items()) == {b"some key": b"some value"}
- with pytest.raises(ValueError, match="must be unspecified"):
- na.Schema(schema_obj._c_schema, name="")
+ with pytest.raises(ValueError):
+ na.Schema(schema_obj._c_schema, some_parameter="some_value")
def test_schema_create_no_params():
@@ -183,3 +190,34 @@ def test_schema_struct():
assert schema_obj.type == na.Type.STRUCT
assert schema_obj.field(0).type == na.Type.INT32
assert schema_obj.field(0).name == "col_name"
+
+
+def test_schema_extension():
+ schema_obj = na.int32()
+ assert schema_obj.extension is None
+
+ # Check with metadata manually added
+ schema_obj = na.Schema(
+ na.int32(),
+ metadata={
+ "ARROW:extension:name": "arrow.test",
+ "ARROW:extension:metadata": "abcdefg",
+ },
+ )
+ assert schema_obj.extension.name == "arrow.test"
+ assert schema_obj.extension.metadata == b"abcdefg"
+
+ # Check from extension_type constructor
+ schema_obj = na.extension_type(na.int32(), "arrow.test", "abcdefg")
+ assert schema_obj.type == na.Type.EXTENSION
+ assert schema_obj.extension is not None
+ assert schema_obj.extension.name == "arrow.test"
+ assert schema_obj.extension.metadata == b"abcdefg"
+ assert schema_obj.extension.storage.type == na.Type.INT32
+ assert schema_obj.nullable is True
+
+ schema_obj = na.extension_type(na.int32(), "arrow.test", nullable=False)
+ assert schema_obj.extension.name == "arrow.test"
+ assert schema_obj.extension.metadata is None
+ assert schema_obj.extension.storage.type == na.Type.INT32
+ assert schema_obj.nullable is False