paleolimbot commented on code in PR #464:
URL: https://github.com/apache/arrow-nanoarrow/pull/464#discussion_r1603390422
##########
python/src/nanoarrow/visitor.py:
##########
@@ -15,68 +15,187 @@
# specific language governing permissions and limitations
# under the License.
-from typing import Any, List, Sequence, Tuple, Union
+from typing import Any, Callable, List, Sequence, Tuple, Union
-from nanoarrow._lib import CArrayView
+from nanoarrow._lib import CArrayView, CArrowType, CBuffer, CBufferBuilder
from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.c_schema import c_schema_view
from nanoarrow.iterator import ArrayViewBaseIterator, PyIterator
from nanoarrow.schema import Type
-def to_pylist(obj, schema=None) -> List:
- """Convert ``obj`` to a ``list()` of Python objects
+class ArrayViewVisitable:
+ """Mixin class providing conversion methods based on visitors
+
+ Can be used with classes that implement ``__arrow_c_stream__()``
+ or ``__arrow_c_array__()``.
+ """
+
+ def to_pylist(self) -> List:
+ """Convert to a ``list()`` of Python objects
+
+ Computes an identical value to ``list(iter_py())`` but can be much
+ faster.
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> from nanoarrow import visitor
+ >>> array = na.Array([1, 2, 3], na.int32())
+ >>> array.to_pylist()
+ [1, 2, 3]
+ """
+ return ListBuilder.visit(self)
+
+ def to_column_list(self, handle_nulls=None) -> Tuple[List[str],
List[Sequence]]:
+ """Convert to a ``list()`` of contiguous sequences
+
+ Converts a stream of struct arrays into its column-wise representation
+ according to :meth:`to_column`.
+
+ Parameters
+ ----------
+ handle_nulls : callable
+ A function returning a sequence based on a validity bytemap and a
+ contiguous buffer of values (e.g., the callable returned by
+ :meth:`nulls_as_sentinel`).
+
+ Examples
+ --------
+
+ >>> import nanoarrow as na
+ >>> import pyarrow as pa
+ >>> batch = pa.record_batch([pa.array([1, 2, 3])], names=["col1"])
+ >>> names, columns = na.Array(batch).to_column_list()
+ >>> names
+ ['col1']
+ >>> columns
+ [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3)]
+ """
+ return ColumnsBuilder.visit(self, handle_nulls=handle_nulls)
+
+ def to_column(self, handle_nulls=None) -> Sequence:
+ """Convert to a contiguous sequence
+
+ Converts a stream of arrays into a columnar representation
+ such that each column is either a contiguous buffer or a ``list()``.
+ Integer, float, and interval arrays are currently converted to their
+ contiguous buffer representation; other types are returned as a list
+ of Python objects. The sequences returned by :meth:`to_column` are
+ designed to work as input to ``pandas.Series`` and/or
``numpy.array()``.
+
+ Parameters
+ ----------
+ obj : array stream-like
+ An array-like or array stream-like object as sanitized by
+ :func:`c_array_stream`.
+ schema : schema-like, optional
+ An optional schema, passed to :func:`c_array_stream`.
+ handle_nulls : callable
+ A function returning a sequence based on a validity bytemap and a
+ contiguous buffer of values (e.g., the callable returned by
+ :meth:`nulls_as_sentinel`).
+
+ Examples
+ --------
+ >>> import nanoarrow as na
+ >>> na.Array([1, 2, 3], na.int32()).to_column()
+ nanoarrow.c_lib.CBuffer(int32[12 b] 1 2 3)
+ """
+ return SingleColumnBuilder.visit(self, handle_nulls=handle_nulls)
- Computes an identical value to ``list(iterator.iter_py())`` but is several
- times faster.
- Paramters
- ---------
- obj : array stream-like
- An array-like or array stream-like object as sanitized by
- :func:`c_array_stream`.
- schema : schema-like, optional
- An optional schema, passed to :func:`c_array_stream`.
+def nulls_forbid() -> Callable[[CBuffer, Sequence], Sequence]:
+ """Erroring null handler
+
+ A null handler that errors when it encounters nulls.
Examples
--------
>>> import nanoarrow as na
- >>> from nanoarrow import visitor
- >>> array = na.c_array([1, 2, 3], na.int32())
- >>> visitor.to_pylist(array)
- [1, 2, 3]
+ >>> na.Array([1, 2, 3], na.int32()).to_column(na.nulls_forbid())
+ nanoarrow.c_lib.CBuffer(int32[12 b] 1 2 3)
+ >>> na.Array([1, None, 3], na.int32()).to_column(na.nulls_forbid())
+ Traceback (most recent call last):
+ ...
+ ValueError: Null present with null_handler=nulls_forbid()
"""
- return ListBuilder.visit(obj, schema)
+ def handle(is_valid, data):
+ if len(is_valid) > 0:
+ raise ValueError("Null present with null_handler=nulls_forbid()")
+
+ return data
-def to_columns(obj, schema=None) -> Tuple[List[str], List[Sequence]]:
- """Convert ``obj`` to a ``list()` of sequences
+ return handle
- Converts a stream of struct arrays into its column-wise representation
- such that each column is either a contiguous buffer or a ``list()``.
- Paramters
- ---------
- obj : array stream-like
- An array-like or array stream-like object as sanitized by
- :func:`c_array_stream`.
- schema : schema-like, optional
- An optional schema, passed to :func:`c_array_stream`.
+def nulls_as_sentinel(sentinel=None):
+ """Sentinel null handler
+
+ A null handler that assigns a sentinel to null values. This is
+ done using numpy using the expression ``data[~is_valid] = sentinel``.
+ The default sentinel value will result in ``nan`` assigned to null
+ values in numeric and boolean outputs.
+
+ Parameters
+ ----------
+ sentinel : scalar, optional
+ The value with which nulls should be replaced.
Examples
--------
>>> import nanoarrow as na
+ >>> na.Array([1, 2, 3], na.int32()).to_column(na.nulls_as_sentinel())
+ array([1, 2, 3], dtype=int32)
+ >>> na.Array([1, None, 3], na.int32()).to_column(na.nulls_as_sentinel())
+ array([ 1., nan, 3.])
+ >>> na.Array([1, None, 3],
na.int32()).to_column(na.nulls_as_sentinel(-999))
+ array([ 1, -999, 3], dtype=int32)
+ """
+ import numpy as np
+
+ def handle(is_valid, data):
+ is_valid = np.array(is_valid, copy=False)
+ data = np.array(data, copy=False)
+
+ if len(is_valid) > 0:
+ out_type = np.result_type(data, sentinel)
+ data = np.array(data, dtype=out_type, copy=True)
+ data[~is_valid] = sentinel
+ return data
+ else:
+ return data
+
+ return handle
+
+
+def nulls_debug() -> Callable[[CBuffer, Sequence], Tuple[CBuffer, Sequence]]:
Review Comment:
I changed the name to `nulls_separate()` and exported it. From
@jorisvandenbossche's comment, it sounds like it would be useful (in the
situation where the caller wants to handle nulls completely on their own).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]