This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new f883fbbc feat(python): Support union types in Python bindings (#820)
f883fbbc is described below
commit f883fbbcfe3f80c722837fde67a603433c707c0a
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Oct 29 16:45:56 2025 -0500
feat(python): Support union types in Python bindings (#820)
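This adds `sparse_union()` and `dense_union()` schema constructors, a `Schema.type_codes` property, and iterator support for both union layouts, so union arrays can be built from buffers and converted back to Python objects: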
```python
import nanoarrow as na
schema = na.dense_union(
    [na.bool_(), na.int32(), na.float64(), na.string()]
)
c_array = na.c_array_from_buffers(
    schema,
    length=6,
    null_count=0,
    buffers=[
        na.c_buffer([0, 1, 2, 3, 0, 0], na.int8()),
        na.c_buffer([0, 0, 0, 0, 1, 2], na.int32()),
    ],
    children=[
        na.c_array([True, None, False], na.bool_()),
        na.c_array([123], na.int32()),
        na.c_array([456.0], na.float64()),
        na.c_array(["789"], na.string()),
    ],
)
na.Array(c_array).to_pylist()
#> [True, 123, 456.0, '789', None, False]
```
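For comparison, the sparse layout represents the same logical values without an offsets buffer; every child spans the full length of the array. The sketch below mirrors the `test_iterator_sparse_union` case added in this commit, and the expected output follows that test's assertion:
```python
import nanoarrow as na

schema = na.sparse_union(
    [na.bool_(), na.int32(), na.float64(), na.string()]
)
c_array = na.c_array_from_buffers(
    schema,
    length=6,
    null_count=0,
    # Sparse unions carry only the type_ids buffer (no offsets buffer)
    buffers=[na.c_buffer([0, 1, 2, 3, 0, 0], na.int8())],
    children=[
        na.c_array([True, None, None, None, None, False], na.bool_()),
        na.c_array([None, 123, None, None, None, None], na.int32()),
        na.c_array([None, None, 456.0, None, None, None], na.float64()),
        na.c_array([None, None, None, "789", None, None], na.string()),
    ],
)
na.Array(c_array).to_pylist()
#> [True, 123, 456.0, '789', None, False]
```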
Closes #716.
---------
Co-authored-by: Copilot <[email protected]>
---
python/src/nanoarrow/__init__.py | 88 ++++++++++++++++++++------------------
python/src/nanoarrow/iterator.py | 62 ++++++++++++++++++++++++++-
python/src/nanoarrow/schema.py | 92 +++++++++++++++++++++++++++++++++++++++-
python/tests/test_iterator.py | 74 ++++++++++++++++++++++++++++++++
python/tests/test_schema.py | 20 +++++++++
5 files changed, 292 insertions(+), 44 deletions(-)
diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index 62221eeb..2d010c72 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -31,48 +31,50 @@ from nanoarrow.c_schema import c_schema
from nanoarrow.c_buffer import c_buffer
from nanoarrow.extension_canonical import bool8
from nanoarrow.schema import (
- Schema,
- Type,
- TimeUnit,
- null,
+ binary_view,
+ binary,
bool_,
- int8,
- uint8,
- int16,
- uint16,
- int32,
- uint32,
- int64,
- uint64,
+ date32,
+ date64,
+ decimal128,
+ decimal256,
+ dense_union,
+ dictionary,
+ duration,
+ extension_type,
+ fixed_size_binary,
+ fixed_size_list,
float16,
float32,
float64,
- string,
+ int16,
+ int32,
+ int64,
+ int8,
+ interval_day_time,
+ interval_month_day_nano,
+ interval_months,
+ large_binary,
+ large_list,
large_string,
- string_view,
list_,
- large_list,
- fixed_size_list,
map_,
- dictionary,
- binary,
- large_binary,
- binary_view,
- fixed_size_binary,
- date32,
- date64,
+ null,
+ schema,
+ Schema,
+ sparse_union,
+ string_view,
+ string,
+ struct,
time32,
time64,
timestamp,
- extension_type,
- duration,
- interval_months,
- interval_day_time,
- interval_month_day_nano,
- decimal128,
- decimal256,
- schema,
- struct,
+ TimeUnit,
+ Type,
+ uint16,
+ uint32,
+ uint64,
+ uint8,
)
from nanoarrow.array import array, Array
from nanoarrow.array_stream import ArrayStream
@@ -81,17 +83,16 @@ from nanoarrow._version import __version__ # noqa: F401
# Helps Sphinx automatically populate an API reference section
__all__ = [
+ "array",
+ "Array",
"ArrayStream",
- "Schema",
- "TimeUnit",
- "Type",
- "binary",
"binary_view",
+ "binary",
"bool_",
"bool8",
- "c_array",
"c_array_from_buffers",
"c_array_stream",
+ "c_array",
"c_buffer",
"c_schema",
"c_version",
@@ -99,6 +100,7 @@ __all__ = [
"date64",
"decimal128",
"decimal256",
+ "dense_union",
"dictionary",
"duration",
"extension_type",
@@ -115,25 +117,27 @@ __all__ = [
"interval_month_day_nano",
"interval_months",
"large_binary",
- "large_string",
"large_list",
+ "large_string",
"list_",
"map_",
"null",
"nulls_as_sentinel",
"nulls_forbid",
"nulls_separate",
- "string",
+ "schema",
+ "Schema",
+ "sparse_union",
"string_view",
+ "string",
"struct",
- "schema",
"time32",
"time64",
"timestamp",
+ "TimeUnit",
+ "Type",
"uint16",
"uint32",
"uint64",
"uint8",
- "Array",
- "array",
]
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index f2e61615..6b66e9eb 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -17,7 +17,7 @@
import warnings
from functools import cached_property
-from itertools import islice, repeat
+from itertools import groupby, islice, repeat
from typing import Iterable, Tuple
from nanoarrow._array import CArrayView
@@ -297,6 +297,64 @@ class PyIterator(ArrayViewBaseIterator):
for _ in range(length):
yield list(islice(child_iter, fixed_size))
+ def _sparse_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in enumerate(type_codes)
+ }
+
+ type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+
+ # Try as hard as we can to reduce the number of times we request a child
+ # iterator by iterating over runs of consecutive type_ids
+ i = 0
+ for item_type_id, item_type_id_iter in groupby(type_id):
+ type_id_run_length = len(list(item_type_id_iter))
+ child = self._children[child_index_by_type_id[item_type_id]]
+ yield from child._iter_chunk(i, type_id_run_length)
+
+ i += type_id_run_length
+
+ def _dense_union_iter(self, offset, length):
+ view = self._array_view
+ offset += view.offset
+
+ type_codes = self.schema.type_codes
+ child_index_by_type_id = {
+ member_id: i for i, member_id in enumerate(type_codes)
+ }
+
+ type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+ offsets = memoryview(view.buffer(1))[offset : (offset + length + 1)]
+
+ # Try as hard as we can to reduce the number of times we request a child
+ # iterator by iterating over runs of consecutive type_ids
+ i = 0
+ for item_type_id, item_type_id_iter in groupby(type_id):
+ type_id_run_length = len(list(item_type_id_iter))
+ child_offsets = offsets[i : (i + type_id_run_length)]
+ child_offset0 = child_offsets[0]
+
+ # This only works if there are no missing elements (i.e., for sequences
+ # of an identical type_id, the elements must be sequential and increasing).
+ # The spec specifies the sequential/increasing nature of these offsets but
+ # we check to be sure.
+ if (child_offsets[-1] - child_offset0) != (type_id_run_length - 1):
+ raise ValueError(
+ f"Child offsets for type_id {item_type_id} are not
sequential: "
+ f"{list(child_offsets)} / {type_id_run_length}"
+ )
+
+ child_index = child_index_by_type_id[item_type_id]
+ yield from self._children[child_index]._iter_chunk(
+ child_offset0, type_id_run_length
+ )
+
+ i += type_id_run_length
+
def _string_iter(self, offset, length):
view = self._array_view
offset += view.offset
@@ -568,6 +626,8 @@ _ITEMS_ITER_LOOKUP = {
_types.LIST: "_list_iter",
_types.LARGE_LIST: "_list_iter",
_types.FIXED_SIZE_LIST: "_fixed_size_list_iter",
+ _types.SPARSE_UNION: "_sparse_union_iter",
+ _types.DENSE_UNION: "_dense_union_iter",
_types.DICTIONARY: "_dictionary_iter",
_types.DATE32: "_date_iter",
_types.DATE64: "_date_iter",
diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py
index 99616eee..c9d73679 100644
--- a/python/src/nanoarrow/schema.py
+++ b/python/src/nanoarrow/schema.py
@@ -18,7 +18,7 @@
import enum
import reprlib
from functools import cached_property
-from typing import List, Mapping, Union
+from typing import List, Mapping, Optional, Union
from nanoarrow._schema import (
CArrowTimeUnit,
@@ -467,6 +467,19 @@ class Schema:
else:
return None
+ @property
+ def type_codes(self) -> Optional[List[int]]:
+ """Union type identifiers
+
+ >>> import nanoarrow as na
+ >>> na.dense_union([na.int32(), na.string()]).type_codes
+ [0, 1]
+ """
+ if self._c_schema_view.type_id in (_types.SPARSE_UNION, _types.DENSE_UNION):
+ return list(self._c_schema_view.union_type_ids)
+ else:
+ return None
+
@property
def n_fields(self) -> int:
"""Number of child Schemas
@@ -1290,6 +1303,71 @@ def dictionary(index_type, value_type, dictionary_ordered: bool = False) -> Sche
)
+def sparse_union(
+ fields, type_codes: Optional[List[int]] = None, nullable: bool = True
+) -> Schema:
+ """Create a type where an element could be one of several pre-defined types
+
+ Parameters
+ ----------
+ fields :
+ * A dictionary whose keys are field names and values are schema-like objects
+ * An iterable whose items are a schema like objects where the field name is
+ inherited from the schema-like object.
+ type_codes : Specific numeric identifiers attached to each field (must be between
+ 0 and 127, inclusive). When missing, these are generated as a sequence along
+ ``fields``.
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.sparse_union([na.int32(), na.string()])
+ <Schema> sparse_union([0,1])<: int32, : string>
+ """
+ if type_codes is None:
+ type_codes = list(range(len(fields)))
+ return Schema(
+ Type.SPARSE_UNION, fields=fields, type_codes=type_codes, nullable=nullable
+ )
+
+
+def dense_union(
+ fields, type_codes: Optional[List[int]] = None, nullable: bool = True
+) -> Schema:
+ """Create a type where an element could be one of several pre-defined types
+
+ A dense union has a more compact (but more complex) representation than a
+ sparse union. Most Arrow unions in use are dense unions.
+
+ Parameters
+ ----------
+ fields :
+ * A dictionary whose keys are field names and values are schema-like objects
+ * An iterable whose items are a schema like objects where the field name is
+ inherited from the schema-like object.
+ type_codes : Specific numeric identifiers attached to each field (must be between
+ 0 and 127, inclusive). When missing, these are generated as a sequence along
+ ``fields``.
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.dense_union([na.int32(), na.string()])
+ <Schema> dense_union([0,1])<: int32, : string>
+ """
+ if type_codes is None:
+ type_codes = list(range(len(fields)))
+ return Schema(
+ Type.DENSE_UNION, fields=fields, type_codes=type_codes, nullable=nullable
+ )
+
+
def extension_type(
storage_schema,
extension_name: str,
@@ -1387,6 +1465,16 @@ def _c_schema_from_type_and_params(type: Type, params: dict):
if "dictionary_ordered" in params and
bool(params.pop("dictionary_ordered")):
factory.set_dictionary_ordered(True)
+ elif type == Type.SPARSE_UNION:
+ type_codes = params.pop("type_codes")
+ type_codes_str = ",".join(str(code) for code in type_codes)
+ factory.set_format(f"+us:{type_codes_str}")
+
+ elif type == Type.DENSE_UNION:
+ type_codes = params.pop("type_codes")
+ type_codes_str = ",".join(str(code) for code in type_codes)
+ factory.set_format(f"+ud:{type_codes_str}")
+
else:
factory.set_type(type.value)
@@ -1457,4 +1545,6 @@ _PARAM_NAMES = {
_types.LARGE_LIST: ("value_type",),
_types.FIXED_SIZE_LIST: ("value_type", "list_size"),
_types.DICTIONARY: ("index_type", "value_type", "dictionary_ordered"),
+ _types.SPARSE_UNION: ("fields", "type_codes"),
+ _types.DENSE_UNION: ("fields", "type_codes"),
}
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index 0c0e0474..8e62027b 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -530,3 +530,77 @@ def test_iterator_extension():
def test_iterator_null():
array = na.c_array_from_buffers(na.null(), 3, [])
assert list(iter_py(array)) == [None, None, None]
+
+
+def test_iterator_sparse_union():
+ children = [
+ na.c_array([True, None, None, None, None, False], na.bool_()),
+ na.c_array([None, 123, None, None, None, None], na.int32()),
+ na.c_array([None, None, 456.0, None, None, None], na.float64()),
+ na.c_array([None, None, None, "789", None, None], na.string()),
+ ]
+
+ # Check with the default sequential type codes
+ schema = na.sparse_union([na.bool_(), na.int32(), na.float64(), na.string()])
+ c_array = na.c_array_from_buffers(
+ schema,
+ length=6,
+ null_count=0,
+ buffers=[na.c_buffer([0, 1, 2, 3, 0, 0], na.int8())],
+ children=children,
+ )
+
+ assert list(iter_py(c_array)) == [True, 123, 456.0, "789", None, False]
+
+ # Check with custom type codes
+ schema = na.sparse_union(
+ [na.bool_(), na.int32(), na.float64(), na.string()], type_codes=[3, 2, 1, 0]
+ )
+ c_array = na.c_array_from_buffers(
+ schema,
+ length=6,
+ null_count=0,
+ buffers=[na.c_buffer([3, 2, 1, 0, 3, 3], na.int8())],
+ children=children,
+ )
+
+ assert list(iter_py(c_array)) == [True, 123, 456.0, "789", None, False]
+
+
+def test_iterator_dense_union():
+ children = [
+ na.c_array([True, None, False], na.bool_()),
+ na.c_array([123], na.int32()),
+ na.c_array([456.0], na.float64()),
+ na.c_array(["789"], na.string()),
+ ]
+
+ schema = na.dense_union([na.bool_(), na.int32(), na.float64(), na.string()])
+ c_array = na.c_array_from_buffers(
+ schema,
+ length=6,
+ null_count=0,
+ buffers=[
+ na.c_buffer([0, 1, 2, 3, 0, 0], na.int8()),
+ na.c_buffer([0, 0, 0, 0, 1, 2], na.int32()),
+ ],
+ children=children,
+ )
+
+ assert list(iter_py(c_array)) == [True, 123, 456.0, "789", None, False]
+
+ schema = na.dense_union(
+ [na.bool_(), na.int32(), na.float64(), na.string()], type_codes=[3, 2, 1, 0]
+ )
+ c_array = na.c_array_from_buffers(
+ schema,
+ length=6,
+ null_count=0,
+ buffers=[
+ na.c_buffer([3, 2, 1, 0, 3, 3], na.int8()),
+ na.c_buffer([0, 0, 0, 0, 1, 2], na.int32()),
+ ],
+ children=children,
+ )
+
+ assert list(iter_py(c_array)) == [True, 123, 456.0, "789", None, False]
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index 7d1f97f3..1a6afc1c 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -222,6 +222,26 @@ def test_schema_dictionary():
assert schema_obj_ordered.dictionary_ordered is True
+def test_schema_union():
+ schema_obj = na.dense_union([na.int32(), na.string()])
+ assert schema_obj.type == na.Type.DENSE_UNION
+ assert schema_obj.type_codes == [0, 1]
+
+ # Make sure custom type_codes come through
+ schema_obj = na.dense_union([na.int32(), na.string()], type_codes=[1, 0])
+ assert schema_obj.type == na.Type.DENSE_UNION
+ assert schema_obj.type_codes == [1, 0]
+
+ schema_obj = na.sparse_union([na.int32(), na.string()])
+ assert schema_obj.type == na.Type.SPARSE_UNION
+ assert schema_obj.type_codes == [0, 1]
+
+ # Make sure custom type_codes come through
+ schema_obj = na.sparse_union([na.int32(), na.string()], type_codes=[1, 0])
+ assert schema_obj.type == na.Type.SPARSE_UNION
+ assert schema_obj.type_codes == [1, 0]
+
+
def test_schema_extension():
schema_obj = na.int32()
assert schema_obj.extension is None