This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f883fbbc feat(python): Support union types in Python bindings (#820)
f883fbbc is described below

commit f883fbbcfe3f80c722837fde67a603433c707c0a
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Oct 29 16:45:56 2025 -0500

    feat(python): Support union types in Python bindings (#820)
    
    ```python
    import nanoarrow as na
    
    schema = na.dense_union(
        [na.bool_(), na.int32(), na.float64(), na.string()]
    )
    
    c_array = na.c_array_from_buffers(
        schema,
        length=6,
        null_count=0,
        buffers=[
            na.c_buffer([0, 1, 2, 3, 0, 0], na.int8()),
            na.c_buffer([0, 0, 0, 0, 1, 2], na.int32()),
        ],
        children=[
            na.c_array([True, None, False], na.bool_()),
            na.c_array([123], na.int32()),
            na.c_array([456.0], na.float64()),
            na.c_array(["789"], na.string()),
        ],
    )
    
    na.Array(c_array).to_pylist()
    #> [True, 123, 456.0, '789', None, False]
    ```
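
    For comparison, a minimal sketch of the equivalent sparse union construction
    (it mirrors the sparse union test added in this patch; a sparse union carries a
    single type id buffer and children that each span the full length of the array):

    ```python
    import nanoarrow as na

    schema = na.sparse_union(
        [na.bool_(), na.int32(), na.float64(), na.string()]
    )

    c_array = na.c_array_from_buffers(
        schema,
        length=6,
        null_count=0,
        buffers=[na.c_buffer([0, 1, 2, 3, 0, 0], na.int8())],
        children=[
            na.c_array([True, None, None, None, None, False], na.bool_()),
            na.c_array([None, 123, None, None, None, None], na.int32()),
            na.c_array([None, None, 456.0, None, None, None], na.float64()),
            na.c_array([None, None, None, "789", None, None], na.string()),
        ],
    )

    na.Array(c_array).to_pylist()
    #> [True, 123, 456.0, '789', None, False]
    ```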
    
    Closes #716.
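
    The patch also exposes the union type identifiers through a new
    `Schema.type_codes` property; a quick check, following the doctests and
    tests added in this patch:

    ```python
    import nanoarrow as na

    # Default type codes are generated as a sequence along the fields
    na.dense_union([na.int32(), na.string()]).type_codes
    #> [0, 1]

    # Custom type codes are preserved
    na.sparse_union([na.int32(), na.string()], type_codes=[1, 0]).type_codes
    #> [1, 0]
    ```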
    
    ---------
    
    Co-authored-by: Copilot <[email protected]>
---
 python/src/nanoarrow/__init__.py | 88 ++++++++++++++++++++------------------
 python/src/nanoarrow/iterator.py | 62 ++++++++++++++++++++++++++-
 python/src/nanoarrow/schema.py   | 92 +++++++++++++++++++++++++++++++++++++++-
 python/tests/test_iterator.py    | 74 ++++++++++++++++++++++++++++++++
 python/tests/test_schema.py      | 20 +++++++++
 5 files changed, 292 insertions(+), 44 deletions(-)

diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index 62221eeb..2d010c72 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -31,48 +31,50 @@ from nanoarrow.c_schema import c_schema
 from nanoarrow.c_buffer import c_buffer
 from nanoarrow.extension_canonical import bool8
 from nanoarrow.schema import (
-    Schema,
-    Type,
-    TimeUnit,
-    null,
+    binary_view,
+    binary,
     bool_,
-    int8,
-    uint8,
-    int16,
-    uint16,
-    int32,
-    uint32,
-    int64,
-    uint64,
+    date32,
+    date64,
+    decimal128,
+    decimal256,
+    dense_union,
+    dictionary,
+    duration,
+    extension_type,
+    fixed_size_binary,
+    fixed_size_list,
     float16,
     float32,
     float64,
-    string,
+    int16,
+    int32,
+    int64,
+    int8,
+    interval_day_time,
+    interval_month_day_nano,
+    interval_months,
+    large_binary,
+    large_list,
     large_string,
-    string_view,
     list_,
-    large_list,
-    fixed_size_list,
     map_,
-    dictionary,
-    binary,
-    large_binary,
-    binary_view,
-    fixed_size_binary,
-    date32,
-    date64,
+    null,
+    schema,
+    Schema,
+    sparse_union,
+    string_view,
+    string,
+    struct,
     time32,
     time64,
     timestamp,
-    extension_type,
-    duration,
-    interval_months,
-    interval_day_time,
-    interval_month_day_nano,
-    decimal128,
-    decimal256,
-    schema,
-    struct,
+    TimeUnit,
+    Type,
+    uint16,
+    uint32,
+    uint64,
+    uint8,
 )
 from nanoarrow.array import array, Array
 from nanoarrow.array_stream import ArrayStream
@@ -81,17 +83,16 @@ from nanoarrow._version import __version__  # noqa: F401
 
 # Helps Sphinx automatically populate an API reference section
 __all__ = [
+    "array",
+    "Array",
     "ArrayStream",
-    "Schema",
-    "TimeUnit",
-    "Type",
-    "binary",
     "binary_view",
+    "binary",
     "bool_",
     "bool8",
-    "c_array",
     "c_array_from_buffers",
     "c_array_stream",
+    "c_array",
     "c_buffer",
     "c_schema",
     "c_version",
@@ -99,6 +100,7 @@ __all__ = [
     "date64",
     "decimal128",
     "decimal256",
+    "dense_union",
     "dictionary",
     "duration",
     "extension_type",
@@ -115,25 +117,27 @@ __all__ = [
     "interval_month_day_nano",
     "interval_months",
     "large_binary",
-    "large_string",
     "large_list",
+    "large_string",
     "list_",
     "map_",
     "null",
     "nulls_as_sentinel",
     "nulls_forbid",
     "nulls_separate",
-    "string",
+    "schema",
+    "Schema",
+    "sparse_union",
     "string_view",
+    "string",
     "struct",
-    "schema",
     "time32",
     "time64",
     "timestamp",
+    "TimeUnit",
+    "Type",
     "uint16",
     "uint32",
     "uint64",
     "uint8",
-    "Array",
-    "array",
 ]
diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
index f2e61615..6b66e9eb 100644
--- a/python/src/nanoarrow/iterator.py
+++ b/python/src/nanoarrow/iterator.py
@@ -17,7 +17,7 @@
 
 import warnings
 from functools import cached_property
-from itertools import islice, repeat
+from itertools import groupby, islice, repeat
 from typing import Iterable, Tuple
 
 from nanoarrow._array import CArrayView
@@ -297,6 +297,64 @@ class PyIterator(ArrayViewBaseIterator):
             for _ in range(length):
                 yield list(islice(child_iter, fixed_size))
 
+    def _sparse_union_iter(self, offset, length):
+        view = self._array_view
+        offset += view.offset
+
+        type_codes = self.schema.type_codes
+        child_index_by_type_id = {
+            member_id: i for i, member_id in enumerate(type_codes)
+        }
+
+        type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+
+        # Try as hard as we can to reduce the number of times we request a child
+        # iterator by iterating over runs of consecutive type_ids
+        i = 0
+        for item_type_id, item_type_id_iter in groupby(type_id):
+            type_id_run_length = len(list(item_type_id_iter))
+            child = self._children[child_index_by_type_id[item_type_id]]
+            yield from child._iter_chunk(i, type_id_run_length)
+
+            i += type_id_run_length
+
+    def _dense_union_iter(self, offset, length):
+        view = self._array_view
+        offset += view.offset
+
+        type_codes = self.schema.type_codes
+        child_index_by_type_id = {
+            member_id: i for i, member_id in enumerate(type_codes)
+        }
+
+        type_id = memoryview(view.buffer(0))[offset : (offset + length + 1)]
+        offsets = memoryview(view.buffer(1))[offset : (offset + length + 1)]
+
+        # Try as hard as we can to reduce the number of times we request a child
+        # iterator by iterating over runs of consecutive type_ids
+        i = 0
+        for item_type_id, item_type_id_iter in groupby(type_id):
+            type_id_run_length = len(list(item_type_id_iter))
+            child_offsets = offsets[i : (i + type_id_run_length)]
+            child_offset0 = child_offsets[0]
+
+            # This only works if there are no missing elements (i.e., for sequences
+            # of an identical type_id, the elements must be sequential and increasing).
+            # The spec specifies the sequential/increasing nature of these offsets but
+            # we check to be sure.
+            if (child_offsets[-1] - child_offset0) != (type_id_run_length - 1):
+                raise ValueError(
+                    f"Child offsets for type_id {item_type_id} are not sequential: "
+                    f"{list(child_offsets)} / {type_id_run_length}"
+                )
+
+            child_index = child_index_by_type_id[item_type_id]
+            yield from self._children[child_index]._iter_chunk(
+                child_offset0, type_id_run_length
+            )
+
+            i += type_id_run_length
+
     def _string_iter(self, offset, length):
         view = self._array_view
         offset += view.offset
@@ -568,6 +626,8 @@ _ITEMS_ITER_LOOKUP = {
     _types.LIST: "_list_iter",
     _types.LARGE_LIST: "_list_iter",
     _types.FIXED_SIZE_LIST: "_fixed_size_list_iter",
+    _types.SPARSE_UNION: "_sparse_union_iter",
+    _types.DENSE_UNION: "_dense_union_iter",
     _types.DICTIONARY: "_dictionary_iter",
     _types.DATE32: "_date_iter",
     _types.DATE64: "_date_iter",
diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py
index 99616eee..c9d73679 100644
--- a/python/src/nanoarrow/schema.py
+++ b/python/src/nanoarrow/schema.py
@@ -18,7 +18,7 @@
 import enum
 import reprlib
 from functools import cached_property
-from typing import List, Mapping, Union
+from typing import List, Mapping, Optional, Union
 
 from nanoarrow._schema import (
     CArrowTimeUnit,
@@ -467,6 +467,19 @@ class Schema:
         else:
             return None
 
+    @property
+    def type_codes(self) -> Optional[List[int]]:
+        """Union type identifiers
+
+        >>> import nanoarrow as na
+        >>> na.dense_union([na.int32(), na.string()]).type_codes
+        [0, 1]
+        """
+        if self._c_schema_view.type_id in (_types.SPARSE_UNION, _types.DENSE_UNION):
+            return list(self._c_schema_view.union_type_ids)
+        else:
+            return None
+
     @property
     def n_fields(self) -> int:
         """Number of child Schemas
@@ -1290,6 +1303,71 @@ def dictionary(index_type, value_type, dictionary_ordered: bool = False) -> Sche
     )
 
 
+def sparse_union(
+    fields, type_codes: Optional[List[int]] = None, nullable: bool = True
+) -> Schema:
+    """Create a type where an element could be one of several pre-defined types
+
+    Parameters
+    ----------
+    fields :
+        * A dictionary whose keys are field names and values are schema-like objects
+        * An iterable whose items are schema-like objects, where the field name is
+          inherited from the schema-like object.
+    type_codes : Specific numeric identifiers attached to each field (must be between
+        0 and 127, inclusive). When missing, these are generated as a sequence along
+        ``fields``.
+    nullable : bool, optional
+        Use ``False`` to mark this field as non-nullable.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> na.sparse_union([na.int32(), na.string()])
+    <Schema> sparse_union([0,1])<: int32, : string>
+    """
+    if type_codes is None:
+        type_codes = list(range(len(fields)))
+    return Schema(
+        Type.SPARSE_UNION, fields=fields, type_codes=type_codes, nullable=nullable
+    )
+
+
+def dense_union(
+    fields, type_codes: Optional[List[int]] = None, nullable: bool = True
+) -> Schema:
+    """Create a type where an element could be one of several pre-defined types
+
+    A dense union has a more compact (but more complex) representation than a
+    sparse union. Most Arrow unions in use are dense unions.
+
+    Parameters
+    ----------
+    fields :
+        * A dictionary whose keys are field names and values are schema-like objects
+        * An iterable whose items are schema-like objects, where the field name is
+          inherited from the schema-like object.
+    type_codes : Specific numeric identifiers attached to each field (must be between
+        0 and 127, inclusive). When missing, these are generated as a sequence along
+        ``fields``.
+    nullable : bool, optional
+        Use ``False`` to mark this field as non-nullable.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> na.dense_union([na.int32(), na.string()])
+    <Schema> dense_union([0,1])<: int32, : string>
+    """
+    if type_codes is None:
+        type_codes = list(range(len(fields)))
+    return Schema(
+        Type.DENSE_UNION, fields=fields, type_codes=type_codes, nullable=nullable
+    )
+
+
 def extension_type(
     storage_schema,
     extension_name: str,
@@ -1387,6 +1465,16 @@ def _c_schema_from_type_and_params(type: Type, params: dict):
         if "dictionary_ordered" in params and bool(params.pop("dictionary_ordered")):
             factory.set_dictionary_ordered(True)
 
+    elif type == Type.SPARSE_UNION:
+        type_codes = params.pop("type_codes")
+        type_codes_str = ",".join(str(code) for code in type_codes)
+        factory.set_format(f"+us:{type_codes_str}")
+
+    elif type == Type.DENSE_UNION:
+        type_codes = params.pop("type_codes")
+        type_codes_str = ",".join(str(code) for code in type_codes)
+        factory.set_format(f"+ud:{type_codes_str}")
+
     else:
         factory.set_type(type.value)
 
@@ -1457,4 +1545,6 @@ _PARAM_NAMES = {
     _types.LARGE_LIST: ("value_type",),
     _types.FIXED_SIZE_LIST: ("value_type", "list_size"),
     _types.DICTIONARY: ("index_type", "value_type", "dictionary_ordered"),
+    _types.SPARSE_UNION: ("fields", "type_codes"),
+    _types.DENSE_UNION: ("fields", "type_codes"),
 }
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index 0c0e0474..8e62027b 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -530,3 +530,77 @@ def test_iterator_extension():
 def test_iterator_null():
     array = na.c_array_from_buffers(na.null(), 3, [])
     assert list(iter_py(array)) == [None, None, None]
+
+
+def test_iterator_sparse_union():
+    children = [
+        na.c_array([True, None, None, None, None, False], na.bool_()),
+        na.c_array([None, 123, None, None, None, None], na.int32()),
+        na.c_array([None, None, 456.0, None, None, None], na.float64()),
+        na.c_array([None, None, None, "789", None, None], na.string()),
+    ]
+
+    # Check with the default sequential type codes
+    schema = na.sparse_union([na.bool_(), na.int32(), na.float64(), na.string()])
+    c_array = na.c_array_from_buffers(
+        schema,
+        length=6,
+        null_count=0,
+        buffers=[na.c_buffer([0, 1, 2, 3, 0, 0], na.int8())],
+        children=children,
+    )
+
+    assert list(iter_py(c_array)) == [True, 123, 456.0, "789", None, False]
+
+    # Check with custom type codes
+    schema = na.sparse_union(
+        [na.bool_(), na.int32(), na.float64(), na.string()], type_codes=[3, 2, 1, 0]
+    )
+    c_array = na.c_array_from_buffers(
+        schema,
+        length=6,
+        null_count=0,
+        buffers=[na.c_buffer([3, 2, 1, 0, 3, 3], na.int8())],
+        children=children,
+    )
+
+    assert list(iter_py(c_array)) == [True, 123, 456.0, "789", None, False]
+
+
+def test_iterator_dense_union():
+    children = [
+        na.c_array([True, None, False], na.bool_()),
+        na.c_array([123], na.int32()),
+        na.c_array([456.0], na.float64()),
+        na.c_array(["789"], na.string()),
+    ]
+
+    schema = na.dense_union([na.bool_(), na.int32(), na.float64(), na.string()])
+    c_array = na.c_array_from_buffers(
+        schema,
+        length=6,
+        null_count=0,
+        buffers=[
+            na.c_buffer([0, 1, 2, 3, 0, 0], na.int8()),
+            na.c_buffer([0, 0, 0, 0, 1, 2], na.int32()),
+        ],
+        children=children,
+    )
+
+    assert list(iter_py(c_array)) == [True, 123, 456.0, "789", None, False]
+
+    schema = na.dense_union(
+        [na.bool_(), na.int32(), na.float64(), na.string()], type_codes=[3, 2, 1, 0]
+    )
+    c_array = na.c_array_from_buffers(
+        schema,
+        length=6,
+        null_count=0,
+        buffers=[
+            na.c_buffer([3, 2, 1, 0, 3, 3], na.int8()),
+            na.c_buffer([0, 0, 0, 0, 1, 2], na.int32()),
+        ],
+        children=children,
+    )
+
+    assert list(iter_py(c_array)) == [True, 123, 456.0, "789", None, False]
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index 7d1f97f3..1a6afc1c 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -222,6 +222,26 @@ def test_schema_dictionary():
     assert schema_obj_ordered.dictionary_ordered is True
 
 
+def test_schema_union():
+    schema_obj = na.dense_union([na.int32(), na.string()])
+    assert schema_obj.type == na.Type.DENSE_UNION
+    assert schema_obj.type_codes == [0, 1]
+
+    # Make sure custom type_codes come through
+    schema_obj = na.dense_union([na.int32(), na.string()], type_codes=[1, 0])
+    assert schema_obj.type == na.Type.DENSE_UNION
+    assert schema_obj.type_codes == [1, 0]
+
+    schema_obj = na.sparse_union([na.int32(), na.string()])
+    assert schema_obj.type == na.Type.SPARSE_UNION
+    assert schema_obj.type_codes == [0, 1]
+
+    # Make sure custom type_codes come through
+    schema_obj = na.sparse_union([na.int32(), na.string()], type_codes=[1, 0])
+    assert schema_obj.type == na.Type.SPARSE_UNION
+    assert schema_obj.type_codes == [1, 0]
+
+
 def test_schema_extension():
     schema_obj = na.int32()
     assert schema_obj.extension is None
