Re: [PR] feat(python): Add column-wise buffer builder [arrow-nanoarrow]

via GitHub Fri, 17 May 2024 08:45:38 -0700


danepitkin commented on code in PR #464:
URL: https://github.com/apache/arrow-nanoarrow/pull/464#discussion_r1605193036



##########
python/src/nanoarrow/visitor.py:
##########
@@ -15,68 +15,192 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from typing import Any, List, Sequence, Tuple, Union
+from typing import Any, Callable, List, Sequence, Tuple, Union
 
-from nanoarrow._lib import CArrayView
+from nanoarrow._lib import CArrayView, CArrowType, CBuffer, CBufferBuilder
 from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.c_schema import c_schema_view
 from nanoarrow.iterator import ArrayViewBaseIterator, PyIterator
 from nanoarrow.schema import Type
 
 
-def to_pylist(obj, schema=None) -> List:
-    """Convert ``obj`` to a ``list()` of Python objects
+class ArrayViewVisitable:
+    """Mixin class providing conversion methods based on visitors
 
-    Computes an identical value to ``list(iterator.iter_py())`` but is several
-    times faster.
+    Can be used with classes that implement ``__arrow_c_stream__()``
+    or ``__arrow_c_array__()``.
+    """
+
+    def to_pylist(self) -> List:
+        """Convert to a ``list`` of Python objects
+
+        Computes an identical value to ``list(iter_py())`` but can be much
+        faster.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> from nanoarrow import visitor
+        >>> array = na.Array([1, 2, 3], na.int32())
+        >>> array.to_pylist()
+        [1, 2, 3]
+        """
+        return ListConverter.visit(self)
+
+    def convert_columns(self, *, handle_nulls=None) -> Tuple[List[str], 
List[Sequence]]:
+        """Convert to a ``list`` of contiguous sequences
+
+        Experimentally converts a stream of struct arrays into a list of 
contiguous
+        sequences using the same logic as :meth:`convert`.
+
+        Paramters
+        ---------
+        handle_nulls : callable
+            A function returning a sequence based on a validity bytemap and a
+            contiguous buffer of values. If the array contains no nulls, the
+            validity bytemap will be ``None``. Built-in handlers include
+            :func:`nulls_as_sentinel`, :func:`nulls_forbid`, and
+            :func:`nulls_separate`). The default value is :func:`nulls_forbid`.
+
+        Examples
+        --------
+        >>> import nanoarrow as na
+        >>> import pyarrow as pa
+        >>> batch = pa.record_batch({"col1": [1, 2, 3], "col2": ["a", "b", 
"c"]})
+        >>> names, columns = na.Array(batch).convert_columns()
+        >>> names
+        ['col1', 'col2']
+        >>> columns
+        [nanoarrow.c_lib.CBuffer(int64[24 b] 1 2 3), ['a', 'b', 'c']]
+        """
+        return ColumnListConverter.visit(self, handle_nulls=handle_nulls)
+
+    def convert(self, *, handle_nulls=None) -> Sequence:

Review Comment:
   I do like `convert`! My last recommendation would be `to_pysequence` 
instead. `to_pybuffer` would also work if pybuffer is split from pylist.



##########
python/src/nanoarrow/visitor.py:
##########
@@ -124,56 +248,209 @@ def finish(self) -> Any:
         return None
 
 
-class ListBuilder(ArrayStreamVisitor):
-    def __init__(self, schema, *, iterator_cls=PyIterator, array_view=None):
+class DispatchingConverter(ArrayStreamVisitor):
+    def __init__(self, schema, handle_nulls=None, *, array_view=None):
         super().__init__(schema, array_view=array_view)
-
-        # Ensure that self._iterator._array_view is self._array_view
-        self._iterator = iterator_cls(schema, array_view=self._array_view)
+        cls, kwargs = _resolve_converter_cls(self._schema, 
handle_nulls=handle_nulls)
+        self._visitor = cls(schema, **kwargs, array_view=self._array_view)
 
     def begin(self, total_elements: Union[int, None] = None):
-        self._lst = []
+        self._visitor.begin(total_elements)
 
-    def visit_chunk_view(self, array_view: CArrayView):
-        # The constructor here ensured that self._iterator._array_view
-        # is populated when self._set_array() is called.
-        self._lst.extend(self._iterator)
+    def visit_chunk_view(self, array_view: CArrayView) -> None:
+        self._visitor.visit_chunk_view(array_view)
 
-    def finish(self) -> List:
-        return self._lst
+    def finish(self) -> Any:
+        return self._visitor.finish()
 
 
-class ColumnsBuilder(ArrayStreamVisitor):
-    def __init__(self, schema, *, array_view=None):
+class ColumnListConverter(ArrayStreamVisitor):
+    def __init__(self, schema, handle_nulls=None, *, array_view=None):
         super().__init__(schema, array_view=array_view)
 
         if self.schema.type != Type.STRUCT:
-            raise ValueError("ColumnsBuilder can only be used on a struct 
array")
+            raise ValueError("ColumnListConverter can only be used on a struct 
array")
 
         # Resolve the appropriate visitor for each column
         self._child_visitors = []
         for child_schema, child_array_view in zip(
             self._schema.children, self._array_view.children
         ):
             self._child_visitors.append(
-                self._resolve_child_visitor(child_schema, child_array_view)
+                self._resolve_child_visitor(
+                    child_schema, child_array_view, handle_nulls
+                )
             )
 
-    def _resolve_child_visitor(self, child_schema, child_array_view):
-        # TODO: Resolve more efficient column builders for single-buffer types
-        return ListBuilder(child_schema, array_view=child_array_view)
+    def _resolve_child_visitor(self, child_schema, child_array_view, 
handle_nulls):
+        cls, kwargs = _resolve_converter_cls(child_schema, handle_nulls)
+        return cls(child_schema, **kwargs, array_view=child_array_view)
 
     def begin(self, total_elements: Union[int, None] = None) -> None:
         for child_visitor in self._child_visitors:
             child_visitor.begin(total_elements)
 
     def visit_chunk_view(self, array_view: CArrayView) -> Any:
+        # This visitor does not handle nulls because it has no way to 
propagate these
+        # into the child columns. It is designed to be used on top-level 
record batch
+        # arrays which typically are marked as non-nullable or do not contain 
nulls.
+        if array_view.null_count > 0:
+            raise ValueError("null_count > 0 encountered in 
ColumnListConverter")
+
         for child_visitor, child_array_view in zip(
             self._child_visitors, array_view.children
         ):
             child_visitor.visit_chunk_view(child_array_view)
 
     def finish(self) -> Tuple[List[str], List[Sequence]]:
-        return [v.schema.name for v in self._child_visitors], [
+        return [child.name for child in self._schema.children], [
             v.finish() for v in self._child_visitors
         ]
+
+
+class ListConverter(ArrayStreamVisitor):

Review Comment:
   What if we named these all `ArrayStreamTo<Type>Converter`?



##########
python/src/nanoarrow/visitor.py:
##########
@@ -124,56 +248,209 @@ def finish(self) -> Any:
         return None
 
 
-class ListBuilder(ArrayStreamVisitor):
-    def __init__(self, schema, *, iterator_cls=PyIterator, array_view=None):
+class DispatchingConverter(ArrayStreamVisitor):
+    def __init__(self, schema, handle_nulls=None, *, array_view=None):
         super().__init__(schema, array_view=array_view)
-
-        # Ensure that self._iterator._array_view is self._array_view
-        self._iterator = iterator_cls(schema, array_view=self._array_view)
+        cls, kwargs = _resolve_converter_cls(self._schema, 
handle_nulls=handle_nulls)
+        self._visitor = cls(schema, **kwargs, array_view=self._array_view)
 
     def begin(self, total_elements: Union[int, None] = None):
-        self._lst = []
+        self._visitor.begin(total_elements)
 
-    def visit_chunk_view(self, array_view: CArrayView):
-        # The constructor here ensured that self._iterator._array_view
-        # is populated when self._set_array() is called.
-        self._lst.extend(self._iterator)
+    def visit_chunk_view(self, array_view: CArrayView) -> None:
+        self._visitor.visit_chunk_view(array_view)
 
-    def finish(self) -> List:
-        return self._lst
+    def finish(self) -> Any:
+        return self._visitor.finish()
 
 
-class ColumnsBuilder(ArrayStreamVisitor):
-    def __init__(self, schema, *, array_view=None):
+class ColumnListConverter(ArrayStreamVisitor):

Review Comment:
   I still think naming this table or tabular would be more helpful to a user. 
Maybe `ArrayStreamToTabularConverter` or perhaps we define a `Table` typedef 
wrapper and call it `ArrayStreamToTableConverter`?
   ```
   Table = namedtuple('table', ['column_names', 'columns'])
   ```
   
   My main feedback about using "column" is that its a subcomponent of a table 
or matrix, but there isn't any tabular object in nanoarrow right now. For 
example, Pandas calls these "Series" and uses "column" terminology when the 
object is a dataframe.



##########
python/src/nanoarrow/visitor.py:
##########
@@ -124,56 +248,209 @@ def finish(self) -> Any:
         return None
 
 
-class ListBuilder(ArrayStreamVisitor):
-    def __init__(self, schema, *, iterator_cls=PyIterator, array_view=None):
+class DispatchingConverter(ArrayStreamVisitor):

Review Comment:
   I think I would prefer `ArrayStreamToPySequenceConverter` here. Sorry to 
continue nitpicking the names, this is one of the last ones that I would change 
if that helps.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python): Add column-wise buffer builder [arrow-nanoarrow]

Reply via email to