jorisvandenbossche commented on code in PR #464:
URL: https://github.com/apache/arrow-nanoarrow/pull/464#discussion_r1600302341
##########
python/src/nanoarrow/visitor.py:
##########
@@ -74,9 +75,60 @@ def to_columns(obj, schema=None) -> Tuple[List[str],
List[Sequence]]:
>>> names
['col1']
>>> columns
- [[1, 2, 3]]
+ [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3)]
"""
- return ColumnsBuilder.visit(obj, schema)
+ return ColumnsBuilder.visit(obj, schema, handle_nulls=handle_nulls)
+
+
+def nulls_forbid() -> Callable[[CBuffer, Sequence], Sequence]:
+ def handle(is_valid, data):
+ if len(is_valid) > 0:
+ raise ValueError("Null present with null_handler=nulls_forbid()")
+
+ return data
+
+ return handle
+
+
+def nulls_debug() -> Callable[[CBuffer, Sequence], Tuple[CBuffer, Sequence]]:
+ def handle(is_valid, data):
+ return is_valid, data
+
+ return handle
+
+
+def nulls_as_sentinel(sentinel=None):
+ from numpy import array, result_type
Review Comment:
```suggestion
import numpy as np
```
##########
python/src/nanoarrow/visitor.py:
##########
@@ -49,7 +50,7 @@ def to_pylist(obj, schema=None) -> List:
return ListBuilder.visit(obj, schema)
-def to_columns(obj, schema=None) -> Tuple[List[str], List[Sequence]]:
+def to_columns(obj, schema=None, handle_nulls=None) -> Tuple[List[str],
List[Sequence]]:
"""Convert ``obj`` to a ``list()` of sequences
Converts a stream of struct arrays into its column-wise representation
Review Comment:
This comment is about the line below: can you clarify when you get a buffer
and when you get a list?
##########
python/src/nanoarrow/visitor.py:
##########
@@ -74,9 +75,60 @@ def to_columns(obj, schema=None) -> Tuple[List[str],
List[Sequence]]:
>>> names
['col1']
>>> columns
- [[1, 2, 3]]
+ [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3)]
"""
- return ColumnsBuilder.visit(obj, schema)
+ return ColumnsBuilder.visit(obj, schema, handle_nulls=handle_nulls)
+
+
+def nulls_forbid() -> Callable[[CBuffer, Sequence], Sequence]:
+ def handle(is_valid, data):
+ if len(is_valid) > 0:
+ raise ValueError("Null present with null_handler=nulls_forbid()")
+
+ return data
+
+ return handle
+
+
+def nulls_debug() -> Callable[[CBuffer, Sequence], Tuple[CBuffer, Sequence]]:
+ def handle(is_valid, data):
+ return is_valid, data
+
+ return handle
+
+
+def nulls_as_sentinel(sentinel=None):
+ from numpy import array, result_type
+
+ def handle(is_valid, data):
+ is_valid = array(is_valid, copy=False)
+ data = array(data, copy=False)
+
+ if len(is_valid) > 0:
+ out_type = result_type(data, sentinel)
+ data = array(data, dtype=out_type, copy=True)
+ data[~is_valid] = sentinel
+ return data
+ else:
+ return data
+
+ return handle
+
+
+def nulls_as_masked_array():
Review Comment:
I wonder if it would be more useful to actually return each array as a tuple
of data and mask (the NumPy masked array isn't used that much).
##########
python/src/nanoarrow/visitor.py:
##########
@@ -74,9 +75,60 @@ def to_columns(obj, schema=None) -> Tuple[List[str],
List[Sequence]]:
>>> names
['col1']
>>> columns
- [[1, 2, 3]]
+ [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3)]
"""
- return ColumnsBuilder.visit(obj, schema)
+ return ColumnsBuilder.visit(obj, schema, handle_nulls=handle_nulls)
+
+
+def nulls_forbid() -> Callable[[CBuffer, Sequence], Sequence]:
+ def handle(is_valid, data):
+ if len(is_valid) > 0:
+ raise ValueError("Null present with null_handler=nulls_forbid()")
+
+ return data
+
+ return handle
+
+
+def nulls_debug() -> Callable[[CBuffer, Sequence], Tuple[CBuffer, Sequence]]:
+ def handle(is_valid, data):
+ return is_valid, data
+
+ return handle
+
+
+def nulls_as_sentinel(sentinel=None):
+ from numpy import array, result_type
+
+ def handle(is_valid, data):
+ is_valid = array(is_valid, copy=False)
+ data = array(data, copy=False)
Review Comment:
```suggestion
is_valid = np.array(is_valid, copy=False)
data = np.array(data, copy=False)
```
Even though the import is just above, I was already confused about what
kind of array we were converting this to, and was looking for a `from nanoarrow
import array` import at the top of the file ... So I think it is better to stick
with the explicit module (alias), which is also the common way numpy is used in Python.
##########
python/src/nanoarrow/visitor.py:
##########
@@ -74,9 +75,60 @@ def to_columns(obj, schema=None) -> Tuple[List[str],
List[Sequence]]:
>>> names
['col1']
>>> columns
- [[1, 2, 3]]
+ [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3)]
"""
- return ColumnsBuilder.visit(obj, schema)
+ return ColumnsBuilder.visit(obj, schema, handle_nulls=handle_nulls)
+
+
+def nulls_forbid() -> Callable[[CBuffer, Sequence], Sequence]:
+ def handle(is_valid, data):
+ if len(is_valid) > 0:
+ raise ValueError("Null present with null_handler=nulls_forbid()")
+
+ return data
+
+ return handle
+
+
+def nulls_debug() -> Callable[[CBuffer, Sequence], Tuple[CBuffer, Sequence]]:
+ def handle(is_valid, data):
+ return is_valid, data
+
+ return handle
+
+
+def nulls_as_sentinel(sentinel=None):
+ from numpy import array, result_type
+
+ def handle(is_valid, data):
+ is_valid = array(is_valid, copy=False)
+ data = array(data, copy=False)
Review Comment:
Also, I am not entirely sure the `copy=False` is correct here for
`is_valid`, or is that already unpacked at this point?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]