This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 253b7ec0 feat(python): Add map type constructor (#687)
253b7ec0 is described below
commit 253b7ec0cb3cc4a5fb15c90c3879c2f2785d5600
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed Nov 20 04:15:36 2024 +0000
feat(python): Add map type constructor (#687)
When working on fixing a problem with Arrow C++'s map type import, I
realized there was no way to create map types or any of the canonical
extensions. Extensions are slightly different and I'll tackle them
later, but map types are relatively straightforward and follow the
existing pattern of all the other constructors:
```python
import nanoarrow as na
na.map_(na.string(), na.int32())
#> <Schema> map<entries: struct<key: string, value: int32>>
```
---
python/src/nanoarrow/__init__.py | 2 +
python/src/nanoarrow/_schema.pyx | 8 ++++
python/src/nanoarrow/schema.py | 98 +++++++++++++++++++++++++++++++++++-----
python/tests/test_schema.py | 10 ++++
4 files changed, 107 insertions(+), 11 deletions(-)
diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index 1a4b898b..7f67dd30 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -52,6 +52,7 @@ from nanoarrow.schema import (
list_,
large_list,
fixed_size_list,
+ map_,
dictionary,
binary,
large_binary,
@@ -115,6 +116,7 @@ __all__ = [
"large_string",
"large_list",
"list_",
+ "map_",
"null",
"nulls_as_sentinel",
"nulls_forbid",
diff --git a/python/src/nanoarrow/_schema.pyx b/python/src/nanoarrow/_schema.pyx
index 2717c529..3e82c065 100644
--- a/python/src/nanoarrow/_schema.pyx
+++ b/python/src/nanoarrow/_schema.pyx
@@ -878,6 +878,14 @@ cdef class CSchemaBuilder:
return self
+ def set_map_keys_sorted(self, map_keys_sorted) -> CSchemaBuilder:
+ if map_keys_sorted:
+ self._ptr.flags = self._ptr.flags | ARROW_FLAG_MAP_KEYS_SORTED
+ else:
+ self._ptr.flags = self._ptr.flags & ~ARROW_FLAG_MAP_KEYS_SORTED
+
+ return self
+
def validate(self) -> CSchemaView:
return CSchemaView(self.c_schema)
diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py
index 2d24bb99..67412e99 100644
--- a/python/src/nanoarrow/schema.py
+++ b/python/src/nanoarrow/schema.py
@@ -401,12 +401,14 @@ class Schema:
return self._c_schema_view.dictionary_ordered
@property
- def value_type(self):
- """Dictionary or list value type
+ def value_type(self) -> Union["Schema", None]:
+ """Dictionary, map, or list value type
>>> import nanoarrow as na
>>> na.list_(na.int32()).value_type
<Schema> 'item': int32
+ >>> na.map_(na.int32(), na.string()).value_type
+ <Schema> 'value': string
>>> na.dictionary(na.int32(), na.string()).value_type
<Schema> string
"""
@@ -416,11 +418,33 @@ class Schema:
_types.FIXED_SIZE_LIST,
):
return self.field(0)
+ elif self._c_schema_view.type_id == _types.MAP:
+ return Schema(self._c_schema.child(0).child(1))
elif self._c_schema_view.type_id == _types.DICTIONARY:
return Schema(self._c_schema.dictionary)
else:
return None
+ @property
+ def key_type(self) -> Union["Schema", None]:
+ """Map key type
+
+ >>> import nanoarrow as na
+ >>> na.map_(na.int32(), na.string()).key_type
+ <Schema> 'key': non-nullable int32
+ """
+ if self._c_schema_view.type_id == _types.MAP:
+ return Schema(self._c_schema.child(0).child(0))
+ else:
+ return None
+
+ @property
+ def keys_sorted(self) -> Union[bool, None]:
+ if self._c_schema_view.type_id == _types.MAP:
+ return self._c_schema_view.map_keys_sorted
+ else:
+ return None
+
@property
def list_size(self) -> Union[int, None]:
"""Fixed-size list element size
@@ -979,7 +1003,7 @@ def timestamp(
return Schema(Type.TIMESTAMP, timezone=timezone, unit=unit, nullable=nullable)
-def duration(unit, nullable: bool = True):
+def duration(unit, nullable: bool = True) -> Schema:
"""Create an instance of a duration type.
Parameters
@@ -999,7 +1023,7 @@ def duration(unit, nullable: bool = True):
return Schema(Type.DURATION, unit=unit, nullable=nullable)
-def interval_months(nullable: bool = True):
+def interval_months(nullable: bool = True) -> Schema:
"""Create an instance of an interval type measured in months.
Parameters
@@ -1017,7 +1041,7 @@ def interval_months(nullable: bool = True):
return Schema(Type.INTERVAL_MONTHS, nullable=nullable)
-def interval_day_time(nullable: bool = True):
+def interval_day_time(nullable: bool = True) -> Schema:
"""Create an instance of an interval type measured as a day/time pair.
Parameters
@@ -1035,7 +1059,7 @@ def interval_day_time(nullable: bool = True):
return Schema(Type.INTERVAL_DAY_TIME, nullable=nullable)
-def interval_month_day_nano(nullable: bool = True):
+def interval_month_day_nano(nullable: bool = True) -> Schema:
"""Create an instance of an interval type measured as a
month/day/nanosecond
tuple.
@@ -1100,7 +1124,7 @@ def decimal256(precision: int, scale: int, nullable: bool = True) -> Schema:
return Schema(Type.DECIMAL256, precision=precision, scale=scale, nullable=nullable)
-def struct(fields, nullable=True) -> Schema:
+def struct(fields, nullable: bool = True) -> Schema:
"""Create a type representing a named sequence of fields.
Parameters
@@ -1124,7 +1148,7 @@ def struct(fields, nullable=True) -> Schema:
return Schema(Type.STRUCT, fields=fields, nullable=nullable)
-def list_(value_type, nullable=True) -> Schema:
+def list_(value_type, nullable: bool = True) -> Schema:
"""Create a type representing a variable-size list of some other type.
Parameters
@@ -1144,7 +1168,7 @@ def list_(value_type, nullable=True) -> Schema:
return Schema(Type.LIST, value_type=value_type, nullable=nullable)
-def large_list(value_type, nullable=True) -> Schema:
+def large_list(value_type, nullable: bool = True) -> Schema:
"""Create a type representing a variable-size list of some other type.
Unlike :func:`list_`, the :func:`large_list` can accommodate arrays
@@ -1167,7 +1191,7 @@ def large_list(value_type, nullable=True) -> Schema:
return Schema(Type.LARGE_LIST, value_type=value_type, nullable=nullable)
-def fixed_size_list(value_type, list_size, nullable=True) -> Schema:
+def fixed_size_list(value_type, list_size: int, nullable: bool = True) -> Schema:
"""Create a type representing a fixed-size list of some other type.
Parameters
@@ -1194,7 +1218,40 @@ def fixed_size_list(value_type, list_size, nullable=True) -> Schema:
)
-def dictionary(index_type, value_type, dictionary_ordered=False):
+def map_(key_type, value_type, keys_sorted: bool = False, nullable: bool = True):
+ """Create a type representing a list of key/value mappings
+
+ Note that each element in the list contains potentially many
+ key/value pairs (and that a map array contains potentially
+ many individual mappings).
+
+ Parameters
+ ----------
+ key_type : schema-like
+ The type of keys in each map element.
+ value_type : schema-like
+ The type of values in each map element.
+ keys_sorted : bool, optional
+ True if keys within each map element are sorted.
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Examples
+ --------
+ >>> import nanoarrow as na
+ >>> na.map_(na.int32(), na.string())
+ <Schema> map<entries: struct<key: int32, value: string>>
+ """
+ return Schema(
+ Type.MAP,
+ key_type=key_type,
+ value_type=value_type,
+ keys_sorted=keys_sorted,
+ nullable=nullable,
+ )
+
+
+def dictionary(index_type, value_type, dictionary_ordered: bool = False) -> Schema:
"""Create a type representing dictionary-encoded values
Parameters
@@ -1290,6 +1347,25 @@ def _c_schema_from_type_and_params(type: Type, params: dict):
factory.allocate_children(1)
factory.set_child(0, "item", c_schema(params.pop("value_type")))
+ elif type == Type.MAP:
+ key_schema = c_schema(params.pop("key_type"))
+ value_schema = c_schema(params.pop("value_type"))
+
+ entries = CSchemaBuilder.allocate()
+ entries.set_format("+s")
+ entries.set_nullable(False)
+ entries.allocate_children(2)
+ entries.set_child(0, "key", key_schema.modify(nullable=False))
+ entries.set_child(1, "value", value_schema)
+
+ factory.set_format("+m")
+ factory.allocate_children(1)
+ factory.set_child(0, "entries", entries.finish())
+ factory.set_nullable(False)
+
+ if "keys_sorted" in params:
+ factory.set_map_keys_sorted(params.pop("keys_sorted"))
+
elif type == Type.DICTIONARY:
index_type = c_schema(params.pop("index_type"))
factory.set_format(index_type.format)
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index 770afa4c..77a52d3a 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -198,6 +198,16 @@ def test_schema_fixed_size_list():
assert schema_obj.list_size == 123
+def test_schema_map():
+ schema_obj = na.map_(na.int32(), na.string())
+ assert schema_obj.type == na.Type.MAP
+ assert schema_obj.key_type.type == na.Type.INT32
+ assert schema_obj.value_type.type == na.Type.STRING
+ assert schema_obj.keys_sorted is False
+
+ assert na.map_(na.int32(), na.string(), keys_sorted=True).keys_sorted is True
+
+
def test_schema_dictionary():
schema_obj = na.dictionary(na.int8(), na.null())
assert schema_obj.type == na.Type.DICTIONARY