jorisvandenbossche commented on code in PR #464:
URL: https://github.com/apache/arrow-nanoarrow/pull/464#discussion_r1603568090


##########
python/src/nanoarrow/visitor.py:
##########
@@ -15,68 +15,185 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Any, List, Sequence, Tuple, Union
+from typing import Any, Callable, List, Sequence, Tuple, Union
 
-from nanoarrow._lib import CArrayView
+from nanoarrow._lib import CArrayView, CArrowType, CBuffer, CBufferBuilder
 from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.c_schema import c_schema_view
 from nanoarrow.iterator import ArrayViewBaseIterator, PyIterator
 from nanoarrow.schema import Type
 
 
-def to_pylist(obj, schema=None) -> List:
-    """Convert ``obj`` to a ``list()` of Python objects
+class ArrayViewVisitable:
+    """Mixin class providing conversion methods based on visitors
 
-    Computes an identical value to ``list(iterator.iter_py())`` but is several
-    times faster.
+    Can be used with classes that implement ``__arrow_c_stream__()``
+    or ``__arrow_c_array__()``.
+    """
+
+    def to_pylist(self) -> List:
+        """Convert to a ``list`` of Python objects
+
+        Computes an identical value to ``list(iter_py())`` but can be much
+        faster.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> from nanoarrow import visitor
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.to_pylist()
+        [1, 2, 3]
+        """
+        return ListBuilder.visit(self)
+
+    def to_column_list(self, handle_nulls=None) -> Tuple[List[str], 
List[Sequence]]:
+        """Convert to a ``list`` of contiguous sequences
+
+        Converts a stream of struct arrays into its column-wise representation
+        according to :meth:`to_column`.
+
+        Paramters
+        ---------
+        handle_nulls : callable
+            A function returning a sequence based on a validity bytemap and a
+            contiguous buffer of values. If the array contains no nulls, the
+            validity bytemap will be ``None``. Built-in handlers include
+            :func:`nulls_as_sentinel`, :func:`nulls_forbid`, and
+            :func:`nulls_separate`). The default value is :func:`nulls_forbid`.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> import pyarrow as pa
+        >>> batch = pa.record_batch({"col1": [1, 2, 3], "col2": ["a", "b", 
"c"]})
+        >>> names, columns = na.Array(batch).to_column_list()
+        >>> names
+        ['col1', 'col2']
+        >>> columns
+        [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3), ['a', 'b', 'c']]
+        """
+        return ColumnsBuilder.visit(self, handle_nulls=handle_nulls)
+
+    def to_column(self, handle_nulls=None) -> Sequence:
+        """Convert to a contiguous sequence
+
+        Converts a stream of arrays into a columnar representation
+        such that each column is either a contiguous buffer or a ``list``.
+        Integer, float, and interval arrays are currently converted to their
+        contiguous buffer representation; other types are returned as a list
+        of Python objects. The sequences returned by :meth:`to_column` are
+        designed to work as input to ``pandas.Series`` and/or 
``numpy.array()``.
+
+        Parameters
+        ----------
+        handle_nulls : callable
+            A function returning a sequence based on a validity bytemap and a
+            contiguous buffer of values. If the array contains no nulls, the
+            validity bytemap will be ``None``. Built-in handlers include
+            :func:`nulls_as_sentinel`, :func:`nulls_forbid`, and
+            :func:`nulls_separate`). The default value is :func:`nulls_forbid`.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> na.Array([1, 2, 3], na.int32()).to_column()
+        nanoarrow.c_lib.CBuffer(int32[12 b] 1 2 3)
+        """
+        return SingleColumnBuilder.visit(self, handle_nulls=handle_nulls)
+
+
+def nulls_forbid() -> Callable[[CBuffer, Sequence], Sequence]:
+    """Erroring null handler
 
-    Paramters
-    ---------
-    obj : array stream-like
-        An array-like or array stream-like object as sanitized by
-        :func:`c_array_stream`.
-    schema : schema-like, optional
-        An optional schema, passed to :func:`c_array_stream`.
+    A null handler that errors when it encounters nulls.
 
     Examples
     --------
 
     >>> import nanoarrow as na
-    >>> from nanoarrow import visitor
-    >>> array = na.c_array([1, 2, 3], na.int32())
-    >>> visitor.to_pylist(array)
-    [1, 2, 3]
+    >>> na.Array([1, 2, 3], na.int32()).to_column(na.nulls_forbid())
+    nanoarrow.c_lib.CBuffer(int32[12 b] 1 2 3)
+    >>> na.Array([1, None, 3], na.int32()).to_column(na.nulls_forbid())
+    Traceback (most recent call last):
+    ...
+    ValueError: Null present with null_handler=nulls_forbid()
     """
-    return ListBuilder.visit(obj, schema)
+
+    def handle(is_valid, data):
+        if is_valid is not None:

Review Comment:
   ```suggestion
           # the is_valid bytemap is only created if there was at least one null
           if is_valid is not None:
   ```



##########
python/src/nanoarrow/visitor.py:
##########
@@ -124,27 +242,26 @@ def finish(self) -> Any:
         return None
 
 
-class ListBuilder(ArrayStreamVisitor):
-    def __init__(self, schema, *, iterator_cls=PyIterator, array_view=None):
+class SingleColumnBuilder(ArrayStreamVisitor):
+    def __init__(self, schema, handle_nulls=None, *, array_view=None):
         super().__init__(schema, array_view=array_view)
-
-        # Ensure that self._iterator._array_view is self._array_view
-        self._iterator = iterator_cls(schema, array_view=self._array_view)
+        cls, kwargs = _resolve_column_builder_cls(
+            self._schema, handle_nulls=handle_nulls
+        )
+        self._visitor = cls(schema, **kwargs, array_view=self._array_view)
 
     def begin(self, total_elements: Union[int, None] = None):
-        self._lst = []
+        self._visitor.begin(total_elements)
 
-    def visit_chunk_view(self, array_view: CArrayView):
-        # The constructor here ensured that self._iterator._array_view
-        # is populated when self._set_array() is called.
-        self._lst.extend(self._iterator)
+    def visit_chunk_view(self, array_view: CArrayView) -> None:
+        self._visitor.visit_chunk_view(array_view)
 
-    def finish(self) -> List:
-        return self._lst
+    def finish(self) -> Any:
+        return self._visitor.finish()
 
 
 class ColumnsBuilder(ArrayStreamVisitor):

Review Comment:
   FWIW, and I don't directly have a better idea, but I found the usage of 
"columns" somewhat confusing here (and also "Builder", because that sounds to 
similar as the builders to build arrays/buffers in the python->arrow 
direction). Naming is hard .. ;)



##########
python/src/nanoarrow/visitor.py:
##########
@@ -15,68 +15,185 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Any, List, Sequence, Tuple, Union
+from typing import Any, Callable, List, Sequence, Tuple, Union
 
-from nanoarrow._lib import CArrayView
+from nanoarrow._lib import CArrayView, CArrowType, CBuffer, CBufferBuilder
 from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.c_schema import c_schema_view
 from nanoarrow.iterator import ArrayViewBaseIterator, PyIterator
 from nanoarrow.schema import Type
 
 
-def to_pylist(obj, schema=None) -> List:
-    """Convert ``obj`` to a ``list()` of Python objects
+class ArrayViewVisitable:
+    """Mixin class providing conversion methods based on visitors
 
-    Computes an identical value to ``list(iterator.iter_py())`` but is several
-    times faster.
+    Can be used with classes that implement ``__arrow_c_stream__()``
+    or ``__arrow_c_array__()``.
+    """
+
+    def to_pylist(self) -> List:
+        """Convert to a ``list`` of Python objects
+
+        Computes an identical value to ``list(iter_py())`` but can be much
+        faster.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> from nanoarrow import visitor
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.to_pylist()
+        [1, 2, 3]
+        """
+        return ListBuilder.visit(self)
+
+    def to_column_list(self, handle_nulls=None) -> Tuple[List[str], 
List[Sequence]]:
+        """Convert to a ``list`` of contiguous sequences
+
+        Converts a stream of struct arrays into its column-wise representation
+        according to :meth:`to_column`.
+
+        Paramters
+        ---------
+        handle_nulls : callable
+            A function returning a sequence based on a validity bytemap and a
+            contiguous buffer of values. If the array contains no nulls, the
+            validity bytemap will be ``None``. Built-in handlers include
+            :func:`nulls_as_sentinel`, :func:`nulls_forbid`, and
+            :func:`nulls_separate`). The default value is :func:`nulls_forbid`.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> import pyarrow as pa
+        >>> batch = pa.record_batch({"col1": [1, 2, 3], "col2": ["a", "b", 
"c"]})
+        >>> names, columns = na.Array(batch).to_column_list()
+        >>> names
+        ['col1', 'col2']
+        >>> columns
+        [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3), ['a', 'b', 'c']]
+        """
+        return ColumnsBuilder.visit(self, handle_nulls=handle_nulls)
+
+    def to_column(self, handle_nulls=None) -> Sequence:
+        """Convert to a contiguous sequence
+
+        Converts a stream of arrays into a columnar representation
+        such that each column is either a contiguous buffer or a ``list``.
+        Integer, float, and interval arrays are currently converted to their
+        contiguous buffer representation; other types are returned as a list
+        of Python objects. The sequences returned by :meth:`to_column` are
+        designed to work as input to ``pandas.Series`` and/or 
``numpy.array()``.
+
+        Parameters
+        ----------
+        handle_nulls : callable
+            A function returning a sequence based on a validity bytemap and a
+            contiguous buffer of values. If the array contains no nulls, the
+            validity bytemap will be ``None``. Built-in handlers include
+            :func:`nulls_as_sentinel`, :func:`nulls_forbid`, and
+            :func:`nulls_separate`). The default value is :func:`nulls_forbid`.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> na.Array([1, 2, 3], na.int32()).to_column()
+        nanoarrow.c_lib.CBuffer(int32[12 b] 1 2 3)
+        """
+        return SingleColumnBuilder.visit(self, handle_nulls=handle_nulls)
+
+
+def nulls_forbid() -> Callable[[CBuffer, Sequence], Sequence]:
+    """Erroring null handler
 
-    Paramters
-    ---------
-    obj : array stream-like
-        An array-like or array stream-like object as sanitized by
-        :func:`c_array_stream`.
-    schema : schema-like, optional
-        An optional schema, passed to :func:`c_array_stream`.
+    A null handler that errors when it encounters nulls.
 
     Examples
     --------
 
     >>> import nanoarrow as na
-    >>> from nanoarrow import visitor
-    >>> array = na.c_array([1, 2, 3], na.int32())
-    >>> visitor.to_pylist(array)
-    [1, 2, 3]
+    >>> na.Array([1, 2, 3], na.int32()).to_column(na.nulls_forbid())
+    nanoarrow.c_lib.CBuffer(int32[12 b] 1 2 3)
+    >>> na.Array([1, None, 3], na.int32()).to_column(na.nulls_forbid())
+    Traceback (most recent call last):
+    ...
+    ValueError: Null present with null_handler=nulls_forbid()
     """
-    return ListBuilder.visit(obj, schema)
+
+    def handle(is_valid, data):
+        if is_valid is not None:
+            raise ValueError("Null present with null_handler=nulls_forbid()")
+
+        return data
+
+    return handle
 
 
-def to_columns(obj, schema=None) -> Tuple[List[str], List[Sequence]]:
-    """Convert ``obj`` to a ``list()` of sequences
+def nulls_as_sentinel(sentinel=None):
+    """Sentinel null handler
 
-    Converts a stream of struct arrays into its column-wise representation
-    such that each column is either a contiguous buffer or a ``list()``.
+    A null handler that assigns a sentinel to null values. This is
+    done using numpy using the expression ``data[~is_valid] = sentinel``.
+    The default sentinel value of ``None`` will result in ``nan``
+    assigned to null values in numeric and boolean outputs. This

Review Comment:
   ```suggestion
       The default sentinel value of ``None`` will result in float output and 
``nan``
       assigned to null values for numeric and boolean inputs. This
   ```



##########
python/src/nanoarrow/visitor.py:
##########
@@ -156,18 +274,23 @@ def __init__(self, schema, *, array_view=None):
             self._schema.children, self._array_view.children
         ):
             self._child_visitors.append(
-                self._resolve_child_visitor(child_schema, child_array_view)
+                self._resolve_child_visitor(
+                    child_schema, child_array_view, handle_nulls
+                )
             )
 
-    def _resolve_child_visitor(self, child_schema, child_array_view):
-        # TODO: Resolve more efficient column builders for single-buffer types
-        return ListBuilder(child_schema, array_view=child_array_view)
+    def _resolve_child_visitor(self, child_schema, child_array_view, 
handle_nulls):
+        cls, kwargs = _resolve_column_builder_cls(child_schema, handle_nulls)
+        return cls(child_schema, **kwargs, array_view=child_array_view)
 
     def begin(self, total_elements: Union[int, None] = None) -> None:
         for child_visitor in self._child_visitors:
             child_visitor.begin(total_elements)
 
     def visit_chunk_view(self, array_view: CArrayView) -> Any:
+        if array_view.null_count > 0:
+            raise ValueError("null_count > 0 encountered in ColumnsBuilder")

Review Comment:
   Can you add a brief comment about this in the code?



##########
python/src/nanoarrow/visitor.py:
##########
@@ -15,68 +15,186 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Any, List, Sequence, Tuple, Union
+from typing import Any, Callable, List, Sequence, Tuple, Union
 
-from nanoarrow._lib import CArrayView
+from nanoarrow._lib import CArrayView, CArrowType, CBuffer, CBufferBuilder
 from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.c_schema import c_schema_view
 from nanoarrow.iterator import ArrayViewBaseIterator, PyIterator
 from nanoarrow.schema import Type
 
 
-def to_pylist(obj, schema=None) -> List:
-    """Convert ``obj`` to a ``list()` of Python objects
+class ArrayViewVisitable:
+    """Mixin class providing conversion methods based on visitors
+
+    Can be used with classes that implement ``__arrow_c_stream__()``
+    or ``__arrow_c_array__()``.
+    """
+
+    def to_pylist(self) -> List:
+        """Convert to a ``list()`` of Python objects
+
+        Computes an identical value to ``list(iter_py())`` but can be much
+        faster.
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> from nanoarrow import visitor
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.to_pylist()
+        [1, 2, 3]
+        """
+        return ListBuilder.visit(self)
+
+    def to_column_list(self, handle_nulls=None) -> Tuple[List[str], 
List[Sequence]]:
+        """Convert to a ``list()` of contiguous sequences
+
+        Converts a stream of struct arrays into its column-wise representation
+        according to :meth:`to_column`.
+
+        Paramters
+        ---------
+        handle_nulls : callable
+            A function returning a sequence based on a validity bytemap and a
+            contiguous buffer of values (e.g., the callable returned by
+            :meth:`nulls_as_sentinel`).
+
+        Examples
+        --------
+
+        >>> import nanoarrow as na
+        >>> import pyarrow as pa
+        >>> batch = pa.record_batch({"col1": [1, 2, 3], "col2": ["a", "b", 
"c"]})
+        >>> names, columns = na.Array(batch).to_column_list()
+        >>> names
+        ['col1', 'col2']
+        >>> columns
+        [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3), ['a', 'b', 'c']]
+        """
+        return ColumnsBuilder.visit(self, handle_nulls=handle_nulls)
+
+    def to_column(self, handle_nulls=None) -> Sequence:
+        """Convert to a contiguous sequence
+
+        Converts a stream of arrays into a columnar representation
+        such that each column is either a contiguous buffer or a ``list()``.
+        Integer, float, and interval arrays are currently converted to their
+        contiguous buffer representation; other types are returned as a list
+        of Python objects. The sequences returned by :meth:`to_column` are
+        designed to work as input to ``pandas.Series`` and/or 
``numpy.array()``.
+
+        Parameters
+        ---------
+        obj : array stream-like
+            An array-like or array stream-like object as sanitized by
+            :func:`c_array_stream`.
+        schema : schema-like, optional
+            An optional schema, passed to :func:`c_array_stream`.
+        handle_nulls : callable

Review Comment:
   This keyword is only used when the output is a buffer and not when it is a 
list?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to