This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new fe5082f0 feat(python): Allow creation of dictionary and list types
(#445)
fe5082f0 is described below
commit fe5082f0c71d57d55770e0175fa40ecb6f1028c6
Author: Dewey Dunnington <[email protected]>
AuthorDate: Wed May 1 09:16:39 2024 -0300
feat(python): Allow creation of dictionary and list types (#445)
This PR adds support for creating dictionary and list types:
```python
import nanoarrow as na
na.list_(na.int32())
#> Schema(LIST, value_type=Schema(INT32, name='item'))
na.dictionary(na.int32(), na.string())
#> Schema(DICTIONARY, index_type=Schema(INT32), value_type=Schema(STRING),
dictionary_ordered=False)
```
Before, creating these types (or associated arrays from buffer) was not
possible. This required some changes to `modify()` to ensure we could
also set `children` and `dictionary` there.
---
python/src/nanoarrow/__init__.py | 12 +-
python/src/nanoarrow/_lib.pyx | 80 ++++++++--
python/src/nanoarrow/schema.py | 289 +++++++++++++++++++++++++++++--------
python/tests/test_c_array.py | 2 +-
python/tests/test_c_buffer.py | 6 +-
python/tests/test_c_buffer_view.py | 4 +-
python/tests/test_c_schema.py | 42 ++++++
python/tests/test_iterator.py | 28 ++--
python/tests/test_schema.py | 48 ++++--
9 files changed, 405 insertions(+), 106 deletions(-)
diff --git a/python/src/nanoarrow/__init__.py b/python/src/nanoarrow/__init__.py
index c1cd12dd..d96ed5c2 100644
--- a/python/src/nanoarrow/__init__.py
+++ b/python/src/nanoarrow/__init__.py
@@ -43,7 +43,7 @@ from nanoarrow.schema import (
Type,
TimeUnit,
null,
- bool,
+ bool_,
int8,
uint8,
int16,
@@ -57,6 +57,10 @@ from nanoarrow.schema import (
float64,
string,
large_string,
+ list_,
+ large_list,
+ fixed_size_list,
+ dictionary,
binary,
large_binary,
fixed_size_binary,
@@ -88,7 +92,7 @@ __all__ = [
"allocate_c_array_stream",
"allocate_c_schema",
"binary",
- "bool",
+ "bool_",
"c_array",
"c_array_from_buffers",
"c_array_stream",
@@ -102,9 +106,11 @@ __all__ = [
"date64",
"decimal128",
"decimal256",
+ "dictionary",
"duration",
"extension_type",
"fixed_size_binary",
+ "fixed_size_list",
"float16",
"float32",
"float64",
@@ -117,6 +123,8 @@ __all__ = [
"interval_months",
"large_binary",
"large_string",
+ "large_list",
+ "list_",
"null",
"string",
"struct",
diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 99127087..04602120 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -769,23 +769,55 @@ cdef class CSchema:
else:
return None
- def modify(self, *, name=None, flags=None, nullable=None, metadata=None,
- validate=True):
- builder = CSchemaBuilder.copy(self)
+ def modify(self, *, format=None, name=None, flags=None, nullable=None,
+ metadata=None, children=None, dictionary=None, validate=True):
+ cdef CSchemaBuilder builder = CSchemaBuilder.allocate()
- if name is not None:
+ if format is None:
+ builder.set_format(self.format)
+ else:
+ builder.set_format(format)
+
+ if name is None:
+ builder.set_name(self.name)
+ elif name is not False:
builder.set_name(name)
- if flags is not None:
+ if flags is None:
+ builder.set_flags(self.flags)
+ else:
builder.set_flags(flags)
if nullable is not None:
builder.set_nullable(nullable)
- if metadata is not None:
- builder.clear_metadata()
+ if metadata is None:
+ if self.metadata is not None:
+ builder.append_metadata(self.metadata)
+ else:
builder.append_metadata(metadata)
+ if children is None:
+ if self.n_children > 0:
+ builder.allocate_children(self.n_children)
+ for i, child in enumerate(self.children):
+ builder.set_child(i, None, child)
+ elif hasattr(children, "items"):
+ builder.allocate_children(len(children))
+ for i, item in enumerate(children.items()):
+ name, child = item
+ builder.set_child(i, name, child)
+ else:
+ builder.allocate_children(len(children))
+ for i, child in enumerate(children):
+ builder.set_child(i, None, child)
+
+ if dictionary is None:
+ if self.dictionary:
+ builder.set_dictionary(self.dictionary)
+ elif dictionary is not False:
+ builder.set_dictionary(dictionary)
+
if validate:
builder.validate()
@@ -1036,19 +1068,10 @@ cdef class CSchemaBuilder:
if self._ptr.release == NULL:
ArrowSchemaInit(self._ptr)
- @staticmethod
- def copy(CSchema schema):
- return CSchemaBuilder(schema.__deepcopy__())
-
@staticmethod
def allocate():
return CSchemaBuilder(CSchema.allocate())
- def clear_metadata(self):
- cdef int code = ArrowSchemaSetMetadata(self.c_schema._ptr, NULL)
- Error.raise_error_not_ok("ArrowSchemaSetMetadata()", code)
- return self
-
def append_metadata(self, metadata):
cdef CBuffer buffer = CBuffer.empty()
@@ -1164,6 +1187,23 @@ cdef class CSchemaBuilder:
if name is not None:
name = str(name)
code = ArrowSchemaSetName(self._ptr.children[i],
name.encode("UTF-8"))
+ Error.raise_error_not_ok("ArrowSchemaSetName()", code)
+
+ return self
+
+ def set_dictionary(self, CSchema dictionary):
+ self.c_schema._assert_valid()
+
+ cdef int code
+ if self._ptr.dictionary == NULL:
+ code = ArrowSchemaAllocateDictionary(self._ptr)
+ Error.raise_error_not_ok("ArrowSchemaAllocateDictionary()", code)
+
+ if self._ptr.dictionary.release != NULL:
+ ArrowSchemaRelease(self._ptr.dictionary)
+
+ code = ArrowSchemaDeepCopy(dictionary._ptr, self._ptr.dictionary)
+ Error.raise_error_not_ok("ArrowSchemaDeepCopy()", code)
return self
@@ -1179,6 +1219,14 @@ cdef class CSchemaBuilder:
return self
+ def set_dictionary_ordered(self, dictionary_ordered):
+ if dictionary_ordered:
+ self._ptr.flags = self._ptr.flags | ARROW_FLAG_DICTIONARY_ORDERED
+ else:
+ self._ptr.flags = self._ptr.flags & ~ARROW_FLAG_DICTIONARY_ORDERED
+
+ return self
+
def validate(self):
return CSchemaView(self.c_schema)
diff --git a/python/src/nanoarrow/schema.py b/python/src/nanoarrow/schema.py
index 97bb45b8..94d1a8c8 100644
--- a/python/src/nanoarrow/schema.py
+++ b/python/src/nanoarrow/schema.py
@@ -193,23 +193,27 @@ class Schema:
name=None,
nullable=None,
metadata=None,
+ fields=None,
**params,
) -> None:
if isinstance(obj, Type):
- self._c_schema = _c_schema_from_type_and_params(
- obj, params, name, nullable, metadata
- )
- self._c_schema_view = CSchemaView(self._c_schema)
- return
-
- if params:
- raise ValueError("params are only supported for obj of class Type")
-
- self._c_schema = c_schema(obj)
-
- if name is not None or nullable is not None or metadata is not None:
+ self._c_schema = _c_schema_from_type_and_params(obj, params)
+ else:
+ if params:
+ raise ValueError("params are only supported for obj of class
Type")
+ self._c_schema = c_schema(obj)
+
+ if (
+ name is not None
+ or nullable is not None
+ or metadata is not None
+ or fields is not None
+ ):
self._c_schema = self._c_schema.modify(
- name=name, nullable=nullable, metadata=metadata
+ name=name,
+ nullable=nullable,
+ metadata=metadata,
+ children=_clean_fields(fields),
)
self._c_schema_view = CSchemaView(self._c_schema)
@@ -343,6 +347,72 @@ class Schema:
return self._c_schema_view.decimal_scale
+ @property
+ def index_type(self) -> Union["Schema", None]:
+ """Dictionary index type
+
+ For dictionary types, the type corresponding to the indices.
+ See also :attr:`value_type`.
+
+ >>> import nanoarrow as na
+ >>> na.dictionary(na.int32(), na.string()).index_type
+ Schema(INT32)
+ """
+ if self._c_schema_view.type_id == CArrowType.DICTIONARY:
+ index_schema = self._c_schema.modify(
+ dictionary=False, flags=0, nullable=self.nullable
+ )
+ return Schema(index_schema)
+ else:
+ return None
+
+ @property
+ def dictionary_ordered(self) -> Union[bool, None]:
+ """Dictionary ordering
+
+ For dictionary types, returns ``True`` if the order of dictionary
values
+ is meaningful.
+
+ >>> import nanoarrow as na
+ >>> na.dictionary(na.int32(), na.string()).dictionary_ordered
+ False
+ """
+ return self._c_schema_view.dictionary_ordered
+
+ @property
+ def value_type(self):
+ """Dictionary or list value type
+
+ >>> import nanoarrow as na
+ >>> na.list_(na.int32()).value_type
+ Schema(INT32, name='item')
+ >>> na.dictionary(na.int32(), na.string()).value_type
+ Schema(STRING)
+ """
+ if self._c_schema_view.type_id in (
+ CArrowType.LIST,
+ CArrowType.LARGE_LIST,
+ CArrowType.FIXED_SIZE_LIST,
+ ):
+ return self.field(0)
+ elif self._c_schema_view.type_id == CArrowType.DICTIONARY:
+ return Schema(self._c_schema.dictionary)
+ else:
+ return None
+
+ @property
+ def list_size(self) -> Union[int, None]:
+ """Fixed-size list element size
+
+ >>> import nanoarrow as na
+ >>> na.fixed_size_list(na.int32(), 123).list_size
+ 123
+ """
+ if self._c_schema_view.type_id == CArrowType.FIXED_SIZE_LIST:
+ return self._c_schema_view.fixed_size
+ else:
+ return None
+
@property
def n_fields(self) -> int:
"""Number of child Schemas
@@ -408,7 +478,7 @@ def null(nullable: bool = True) -> Schema:
return Schema(Type.NULL, nullable=nullable)
-def bool(nullable: bool = True) -> Schema:
+def bool_(nullable: bool = True) -> Schema:
"""Create an instance of a boolean type.
Parameters
@@ -420,7 +490,7 @@ def bool(nullable: bool = True) -> Schema:
--------
>>> import nanoarrow as na
- >>> na.bool()
+ >>> na.bool_()
Schema(BOOL)
"""
return Schema(Type.BOOL, nullable=nullable)
@@ -945,9 +1015,8 @@ def struct(fields, nullable=True) -> Schema:
----------
fields :
* A dictionary whose keys are field names and values are schema-like
objects
- * An iterable whose items are a schema like object or a two-tuple of
the
- field name and a schema-like object. If a field name is not specified
- from the tuple, the field name is inherited from the schema-like
object.
+ * An iterable whose items are schema-like objects, where the field
name is
+ inherited from the schema-like object.
nullable : bool, optional
Use ``False`` to mark this field as non-nullable.
@@ -957,14 +1026,113 @@ def struct(fields, nullable=True) -> Schema:
>>> import nanoarrow as na
>>> na.struct([na.int32()])
Schema(STRUCT, fields=[Schema(INT32)])
- >>> na.struct([("col1", na.int32())])
- Schema(STRUCT, fields=[Schema(INT32, name='col1')])
>>> na.struct({"col1": na.int32()})
Schema(STRUCT, fields=[Schema(INT32, name='col1')])
"""
return Schema(Type.STRUCT, fields=fields, nullable=nullable)
+def list_(value_type, nullable=True) -> Schema:
+ """Create a type representing a variable-size list of some other type.
+
+ Parameters
+ ----------
+ value_type : schema-like
+ The type of values in each list element.
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.list_(na.int32())
+ Schema(LIST, value_type=Schema(INT32, name='item'))
+ """
+ return Schema(Type.LIST, value_type=value_type, nullable=nullable)
+
+
+def large_list(value_type, nullable=True) -> Schema:
+ """Create a type representing a variable-size list of some other type.
+
+ Unlike :func:`list_`, :func:`large_list` can accommodate arrays
+ with more than ``2 ** 31 - 1`` items in the values array.
+
+ Parameters
+ ----------
+ value_type : schema-like
+ The type of values in each list element.
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.large_list(na.int32())
+ Schema(LARGE_LIST, value_type=Schema(INT32, name='item'))
+ """
+ return Schema(Type.LARGE_LIST, value_type=value_type, nullable=nullable)
+
+
+def fixed_size_list(value_type, list_size, nullable=True) -> Schema:
+ """Create a type representing a fixed-size list of some other type.
+
+ Parameters
+ ----------
+ value_type : schema-like
+ The type of values in each list element.
+ list_size : int
+ The number of values in each list element.
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.fixed_size_list(na.int32(), 123)
+ Schema(FIXED_SIZE_LIST, value_type=Schema(INT32, name='item'),
list_size=123)
+ """
+ return Schema(
+ Type.FIXED_SIZE_LIST,
+ value_type=value_type,
+ list_size=list_size,
+ nullable=nullable,
+ )
+
+
+def dictionary(index_type, value_type, dictionary_ordered=False):
+ """Create a type representing dictionary-encoded values
+
+ Parameters
+ ----------
+ index_type : schema-like
+ The data type of the indices. Must be an integral type.
+ value_type : schema-like
+ The type of the dictionary array.
+ dictionary_ordered : bool, optional
+ Use ``True`` if the order of values in the dictionary array is
+ meaningful.
+ nullable : bool, optional
+ Use ``False`` to mark this field as non-nullable.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> na.dictionary(na.int32(), na.string())
+ Schema(DICTIONARY, index_type=Schema(INT32), value_type=Schema(STRING), \
+dictionary_ordered=False)
+ """
+ return Schema(
+ Type.DICTIONARY,
+ index_type=index_type,
+ value_type=value_type,
+ dictionary_ordered=dictionary_ordered,
+ )
+
+
def extension_type(
storage_schema,
extension_name: str,
@@ -993,24 +1161,10 @@ def extension_type(
return Schema(storage_schema, nullable=nullable, metadata=metadata)
-def _c_schema_from_type_and_params(
- type: Type,
- params: dict,
- name: Union[bool, None, bool],
- nullable: Union[bool, None],
- metadata: Mapping[Union[str, bytes], Union[str, bytes]],
-):
+def _c_schema_from_type_and_params(type: Type, params: dict):
factory = CSchemaBuilder.allocate()
- if type == Type.STRUCT:
- fields = _clean_fields(params.pop("fields"))
-
- factory.set_format("+s").allocate_children(len(fields))
- for i, item in enumerate(fields):
- child_name, c_schema = item
- factory.set_child(i, child_name, c_schema)
-
- elif type.value in CSchemaView._decimal_types:
+ if type.value in CSchemaView._decimal_types:
precision = int(params.pop("precision"))
scale = int(params.pop("scale"))
factory.set_type_decimal(type.value, precision, scale)
@@ -1029,6 +1183,32 @@ def _c_schema_from_type_and_params(
elif type == Type.FIXED_SIZE_BINARY:
factory.set_type_fixed_size(type.value, int(params.pop("byte_width")))
+ elif type == Type.LIST:
+ factory.set_format("+l")
+ factory.allocate_children(1)
+ factory.set_child(0, "item", c_schema(params.pop("value_type")))
+
+ elif type == Type.LARGE_LIST:
+ factory.set_format("+L")
+ factory.allocate_children(1)
+ factory.set_child(0, "item", c_schema(params.pop("value_type")))
+
+ elif type == Type.FIXED_SIZE_LIST:
+ fixed_size = int(params.pop("list_size"))
+ factory.set_format(f"+w:{fixed_size}")
+ factory.allocate_children(1)
+ factory.set_child(0, "item", c_schema(params.pop("value_type")))
+
+ elif type == Type.DICTIONARY:
+ index_type = c_schema(params.pop("index_type"))
+ factory.set_format(index_type.format)
+
+ value_type = c_schema(params.pop("value_type"))
+ factory.set_dictionary(value_type)
+
+ if "dictionary_ordered" in params and
bool(params.pop("dictionary_ordered")):
+ factory.set_dictionary_ordered(True)
+
else:
factory.set_type(type.value)
@@ -1036,38 +1216,19 @@ def _c_schema_from_type_and_params(
unused = ", ".join(f"'{item}'" for item in params.keys())
raise ValueError(f"Unused parameters whilst constructing Schema:
{unused}")
- # Apply default nullability (True)
- if nullable is None:
- nullable = True
- factory.set_nullable(nullable)
-
- # Apply default name (an empty string). To explicitly set a NULL
- # name, a caller would have to specify False.
- if name is None:
- name = ""
- elif name is False:
- name = None
- factory.set_name(name)
-
- # Apply metadata
- if metadata is not None:
- factory.append_metadata(metadata)
+ # Better default than NULL, which causes some implementations to crash
+ factory.set_name("")
return factory.finish()
def _clean_fields(fields):
- if isinstance(fields, dict):
- return [(str(k), c_schema(v)) for k, v in fields.items()]
+ if fields is None:
+ return None
+ elif hasattr(fields, "items"):
+ return {k: c_schema(v) for k, v in fields.items()}
else:
- fields_clean = []
- for item in fields:
- if isinstance(item, tuple) and len(item) == 2:
- fields_clean.append((str(item[0]), c_schema(item[1])))
- else:
- fields_clean.append((None, c_schema(item)))
-
- return fields_clean
+ return [c_schema(v) for v in fields]
def _schema_repr(obj):
@@ -1120,4 +1281,8 @@ _PARAM_NAMES = {
CArrowType.DECIMAL128: ("precision", "scale"),
CArrowType.DECIMAL256: ("precision", "scale"),
CArrowType.STRUCT: ("fields",),
+ CArrowType.LIST: ("value_type",),
+ CArrowType.LARGE_LIST: ("value_type",),
+ CArrowType.FIXED_SIZE_LIST: ("value_type", "list_size"),
+ CArrowType.DICTIONARY: ("index_type", "value_type", "dictionary_ordered"),
}
diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
index b64370da..b5ec3b90 100644
--- a/python/tests/test_c_array.py
+++ b/python/tests/test_c_array.py
@@ -303,7 +303,7 @@ def test_c_array_from_iterable_float_with_nulls():
def test_c_array_from_iterable_bool_with_nulls():
- c_array = na.c_array([True, None, False], na.bool())
+ c_array = na.c_array([True, None, False], na.bool_())
assert c_array.length == 3
assert c_array.null_count == 1
diff --git a/python/tests/test_c_buffer.py b/python/tests/test_c_buffer.py
index 40ba4a3b..d18435cc 100644
--- a/python/tests/test_c_buffer.py
+++ b/python/tests/test_c_buffer.py
@@ -322,7 +322,7 @@ def test_c_buffer_from_decimal256_iterable():
def test_c_buffer_bitmap_from_iterable():
# Check something less than one byte
- buffer = na.c_buffer([True, False, False, True], na.bool())
+ buffer = na.c_buffer([True, False, False, True], na.bool_())
assert "10010000" in repr(buffer)
assert buffer.size_bytes == 1
assert buffer.data_type == "bool"
@@ -343,13 +343,13 @@ def test_c_buffer_bitmap_from_iterable():
)
# Check something exactly one byte
- buffer = na.c_buffer([True, False, False, True] * 2, na.bool())
+ buffer = na.c_buffer([True, False, False, True] * 2, na.bool_())
assert "10011001" in repr(buffer)
assert buffer.size_bytes == 1
assert list(buffer.elements()) == [True, False, False, True] * 2
# Check something more than one byte
- buffer = na.c_buffer([True, False, False, True] * 3, na.bool())
+ buffer = na.c_buffer([True, False, False, True] * 3, na.bool_())
assert "1001100110010000" in repr(buffer)
assert buffer.size_bytes == 2
assert list(buffer.elements()) == [True, False, False, True] * 3 + [
diff --git a/python/tests/test_c_buffer_view.py
b/python/tests/test_c_buffer_view.py
index b885488e..e84c04df 100644
--- a/python/tests/test_c_buffer_view.py
+++ b/python/tests/test_c_buffer_view.py
@@ -20,8 +20,8 @@ import pytest
import nanoarrow as na
-def test_buffer_view_bool():
- bool_array_view = na.c_array_view([1, 0, 0, 1], na.bool())
+def test_buffer_view_bool_():
+ bool_array_view = na.c_array_view([1, 0, 0, 1], na.bool_())
view = bool_array_view.buffer(1)
assert view.element_size_bits == 1
diff --git a/python/tests/test_c_schema.py b/python/tests/test_c_schema.py
index 5617fe7b..e299157f 100644
--- a/python/tests/test_c_schema.py
+++ b/python/tests/test_c_schema.py
@@ -133,6 +133,9 @@ def test_c_schema_modify():
assert schema_clone is not schema
assert schema._addr() != schema_clone._addr()
+ schema_formatted = schema.modify(format="i")
+ assert schema_formatted.format == "i"
+
schema_named = schema.modify(name="something else")
assert schema_named.name == "something else"
assert schema_named.format == schema.format
@@ -155,3 +158,42 @@ def test_c_schema_modify():
schema_no_metad = schema_metad.modify(metadata={})
assert schema_no_metad.metadata is None
+
+
+def test_c_schema_modify_children():
+ schema = na.c_schema(na.struct({"col1": na.null()}))
+
+ schema_same_children = schema.modify()
+ assert schema_same_children.n_children == 1
+ assert schema_same_children.child(0).name == "col1"
+ assert schema_same_children.child(0).format == "n"
+
+ schema_new_children_list = schema.modify(
+ children=[na.c_schema(na.int32()).modify(name="new name")]
+ )
+ assert schema_new_children_list.n_children == 1
+ assert schema_new_children_list.child(0).name == "new name"
+ assert schema_new_children_list.child(0).format == "i"
+
+ schema_new_children_dict = schema.modify(
+ children={"new name": na.c_schema(na.int32())}
+ )
+ assert schema_new_children_dict.n_children == 1
+ assert schema_new_children_dict.child(0).name == "new name"
+ assert schema_new_children_dict.child(0).format == "i"
+
+
+def test_c_schema_modify_dictionary():
+ schema = na.c_schema(na.int32())
+
+ schema_dictionary = schema.modify(dictionary=na.c_schema(na.string()))
+ assert schema_dictionary.format == "i"
+ assert schema_dictionary.dictionary.format == "u"
+
+ schema_same_dictionary = schema_dictionary.modify()
+ assert schema_same_dictionary.format == "i"
+ assert schema_same_dictionary.dictionary.format == "u"
+
+ schema_no_dictionary = schema_dictionary.modify(dictionary=False)
+ assert schema_no_dictionary.format == "i"
+ assert schema.dictionary is None
diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
index 826f9be2..ff0b34e2 100644
--- a/python/tests/test_iterator.py
+++ b/python/tests/test_iterator.py
@@ -106,10 +106,13 @@ def test_iterator_nullable_binary():
def test_iter_tuples():
array = na.c_array_from_buffers(
- na.struct({"col1": na.int32(), "col2": na.bool()}),
+ na.struct({"col1": na.int32(), "col2": na.bool_()}),
length=3,
buffers=[None],
- children=[na.c_array([1, 2, 3], na.int32()), na.c_array([1, 0, 1],
na.bool())],
+ children=[
+ na.c_array([1, 2, 3], na.int32()),
+ na.c_array([1, 0, 1], na.bool_()),
+ ],
)
assert list(iter_tuples(array)) == [(1, True), (2, False), (3, True)]
@@ -131,12 +134,12 @@ def test_iter_tuples():
def test_iter_tuples_nullable():
array = na.c_array_from_buffers(
- na.struct({"col1": na.int32(), "col2": na.bool()}),
+ na.struct({"col1": na.int32(), "col2": na.bool_()}),
length=4,
- buffers=[na.c_buffer([True, True, True, False], na.bool())],
+ buffers=[na.c_buffer([True, True, True, False], na.bool_())],
children=[
na.c_array([1, 2, 3, 4], na.int32()),
- na.c_array([1, 0, 1, 0], na.bool()),
+ na.c_array([1, 0, 1, 0], na.bool_()),
],
)
@@ -148,7 +151,7 @@ def test_iter_tuples_nullable():
sliced_child = na.c_array_from_buffers(
array.schema,
length=3,
- buffers=[na.c_buffer([True, True, False], na.bool())],
+ buffers=[na.c_buffer([True, True, False], na.bool_())],
children=[array.child(0)[1:], array.child(1)[1:]],
)
assert list(iter_tuples(sliced_child)) == [(2, False), (3, True), None]
@@ -164,10 +167,13 @@ def test_iter_tuples_errors():
def test_iterator_struct():
array = na.c_array_from_buffers(
- na.struct({"col1": na.int32(), "col2": na.bool()}),
+ na.struct({"col1": na.int32(), "col2": na.bool_()}),
length=3,
buffers=[None],
- children=[na.c_array([1, 2, 3], na.int32()), na.c_array([1, 0, 1],
na.bool())],
+ children=[
+ na.c_array([1, 2, 3], na.int32()),
+ na.c_array([1, 0, 1], na.bool_()),
+ ],
)
assert list(iter_py(array)) == [
@@ -185,12 +191,12 @@ def test_iterator_struct():
def test_iterator_nullable_struct():
array = na.c_array_from_buffers(
- na.struct({"col1": na.int32(), "col2": na.bool()}),
+ na.struct({"col1": na.int32(), "col2": na.bool_()}),
length=4,
- buffers=[na.c_buffer([True, True, True, False], na.bool())],
+ buffers=[na.c_buffer([True, True, True, False], na.bool_())],
children=[
na.c_array([1, 2, 3, 4], na.int32()),
- na.c_array([1, 0, 1, 0], na.bool()),
+ na.c_array([1, 0, 1, 0], na.bool_()),
],
)
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index cc9e42b2..38c412f3 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -77,13 +77,17 @@ def test_schema_create_no_params():
assert schema_obj.name == "not empty"
assert "name='not empty'" in repr(schema_obj)
+ msg = "params are only supported for obj of class Type"
+ with pytest.raises(ValueError, match=msg):
+ na.Schema(na.fixed_size_binary(123), byte_width=12)
+
with pytest.raises(ValueError, match=r"^Unused parameter"):
na.Schema(na.Type.INT32, unused_param="unused_value")
def test_schema_simple():
assert na.null().type == na.Type.NULL
- assert na.bool().type == na.Type.BOOL
+ assert na.bool_().type == na.Type.BOOL
assert na.int8().type == na.Type.INT8
assert na.uint8().type == na.Type.UINT8
assert na.int16().type == na.Type.INT16
@@ -171,13 +175,6 @@ def test_schema_struct():
assert "fields=[Schema(INT32)]" in repr(schema_obj)
- # Make sure we can use a list of two-tuples
- schema_obj = na.struct([("col_name", na.Type.INT32)])
- assert schema_obj.type == na.Type.STRUCT
- assert schema_obj.field(0).type == na.Type.INT32
- assert schema_obj.field(0).name == "col_name"
- assert "fields=[Schema(INT32, name='col_name')]" in repr(schema_obj)
-
# Make sure we can use a dictionary to specify fields
schema_obj = na.struct({"col_name": na.Type.INT32})
assert schema_obj.type == na.Type.STRUCT
@@ -185,13 +182,46 @@ def test_schema_struct():
assert schema_obj.field(0).name == "col_name"
# Make sure we can use a Schema when constructing fields (and that
- # fild names are taken from the input)
+ # field names are taken from the input)
schema_obj = na.struct([schema_obj.field(0)])
assert schema_obj.type == na.Type.STRUCT
assert schema_obj.field(0).type == na.Type.INT32
assert schema_obj.field(0).name == "col_name"
+def test_schema_list_():
+ schema_obj = na.list_(na.null())
+ assert schema_obj.type == na.Type.LIST
+ assert schema_obj.value_type.type == na.Type.NULL
+
+
+def test_schema_large_list():
+ schema_obj = na.large_list(na.null())
+ assert schema_obj.type == na.Type.LARGE_LIST
+ assert schema_obj.value_type.type == na.Type.NULL
+
+
+def test_schema_fixed_size_list():
+ schema_obj = na.fixed_size_list(na.null(), 123)
+ assert schema_obj.type == na.Type.FIXED_SIZE_LIST
+ assert schema_obj.value_type.type == na.Type.NULL
+ assert schema_obj.list_size == 123
+
+
+def test_schema_dictionary():
+ schema_obj = na.dictionary(na.int8(), na.null())
+ assert schema_obj.type == na.Type.DICTIONARY
+ assert schema_obj.index_type.type == na.Type.INT8
+ assert schema_obj.value_type.type == na.Type.NULL
+ assert schema_obj.dictionary_ordered is False
+
+ schema_obj_ordered = na.dictionary(na.int8(), na.null(),
dictionary_ordered=True)
+ assert schema_obj_ordered.type == na.Type.DICTIONARY
+ assert schema_obj_ordered.index_type.type == na.Type.INT8
+ assert schema_obj_ordered.value_type.type == na.Type.NULL
+ assert schema_obj_ordered.dictionary_ordered is True
+
+
def test_schema_extension():
schema_obj = na.int32()
assert schema_obj.extension is None