danepitkin commented on code in PR #454:
URL: https://github.com/apache/arrow-nanoarrow/pull/454#discussion_r1597196922


##########
python/tests/test_visitor.py:
##########
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+import nanoarrow as na
+from nanoarrow import visitor
+
+
+def test_to_pylist():
+    assert visitor.to_pylist([1, 2, 3], na.int32()) == [1, 2, 3]

Review Comment:
   This feels like an odd test because the input and output are technically 
both pylists. Would it be better to build an array and use that?



##########
python/src/nanoarrow/visitor.py:
##########
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Any, List, Sequence, Tuple, Union
+
+from nanoarrow._lib import CArrayView
+from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.iterator import ArrayViewBaseIterator, PyIterator
+from nanoarrow.schema import Type
+
+
+def to_pylist(obj, schema=None) -> List:
+    """Convert ``obj`` to a ``list()` of Python objects
+
+    Computes an identical value to ``list(iterator.iter_py())`` but is several
+    times faster.
+
+    Paramters
+    ---------
+    obj : array stream-like
+        An array-like or array stream-like object as sanitized by
+        :func:`c_array_stream`.
+    schema : schema-like, optional
+        An optional schema, passed to :func:`c_array_stream`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> from nanoarrow import visitor
+    >>> array = na.c_array([1, 2, 3], na.int32())
+    >>> visitor.to_pylist(array)
+    [1, 2, 3]
+    """
+    return ListBuilder.visit(obj, schema)
+
+
+def to_columns(obj, schema=None) -> Tuple[List[str], List[Sequence]]:
+    """Convert ``obj`` to a ``list()` of sequences
+
+    Converts a stream of struct arrays into its column-wise representation
+    such that each column is either a contiguous buffer or a ``list()``.
+
+    Paramters
+    ---------
+    obj : array stream-like
+        An array-like or array stream-like object as sanitized by
+        :func:`c_array_stream`.
+    schema : schema-like, optional
+        An optional schema, passed to :func:`c_array_stream`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> from nanoarrow import visitor
+    >>> import pyarrow as pa
+    >>> array = pa.record_batch([pa.array([1, 2, 3])], names=["col1"])
+    >>> names, columns = visitor.to_columns(array)
+    >>> names
+    ['col1']
+    >>> columns
+    [[1, 2, 3]]
+    """
+    return ColumnsBuilder.visit(obj, schema)
+
+
+class ArrayStreamVisitor(ArrayViewBaseIterator):
+    """Compute a value from one or more arrays in an ArrowArrayStream
+
+    This class supports a (currently internal) pattern for building
+    output from a zero or more arrays in a stream.
+
+    """
+
+    @classmethod
+    def visit(cls, obj, schema=None, total_elements=None, **kwargs):
+        """Visit all chunks in ``obj`` as a :func:`c_array_stream`."""
+
+        if total_elements is None and hasattr(obj, "__len__"):
+            total_elements = len(obj)
+
+        with c_array_stream(obj, schema=schema) as stream:
+            visitor = cls(stream._get_cached_schema(), **kwargs)
+            visitor.begin(total_elements)
+
+            visitor_set_array = visitor._set_array
+            visit_chunk_view = visitor.visit_chunk_view
+            array_view = visitor._array_view
+
+            for array in stream:
+                visitor_set_array(array)
+                visit_chunk_view(array_view)
+
+        return visitor.finish()
+
+    def begin(self, total_elements: Union[int, None] = None):
+        """Called after the schema has been resolved but before any
+        chunks have been visited. If the total number of elements
+        (i.e., the sum of all chunk lengths) is known, it is provided here.
+        """
+        pass

Review Comment:
   Maybe raise `NotImplementedError()` here? Alternatively, you could also look 
into https://docs.python.org/3/library/abc.html and abstract methods.



##########
python/src/nanoarrow/iterator.py:
##########
@@ -199,7 +175,41 @@ class PyIterator(ArrayViewBaseIterator):
     Intended for internal use.
     """
 
+    @classmethod
+    def get_iterator(cls, obj, schema=None):
+        with c_array_stream(obj, schema=schema) as stream:
+            iterator = cls(stream._get_cached_schema())
+            for array in stream:
+                iterator._set_array(array)
+                yield from iterator
+
+    def __init__(self, schema, *, _array_view=None):

Review Comment:
   nit: `_array_view` -> `array_view` as a param name (also applies to the base 
class)



##########
python/src/nanoarrow/visitor.py:
##########
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Any, List, Sequence, Tuple, Union
+
+from nanoarrow._lib import CArrayView
+from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.iterator import ArrayViewBaseIterator, PyIterator
+from nanoarrow.schema import Type
+
+
+def to_pylist(obj, schema=None) -> List:

Review Comment:
   I think my personal preference would be to have `to_pylist` and `to_columns` 
be class APIs instead of helper functions. e.g. 
   
   ```
       >>> import nanoarrow as na
       >>> array = na.c_array([1, 2, 3], na.int32())
       >>> array.to_pylist()
   ```
   
   You could put these two apis in an abstract base class and add the base 
class to other classes. WDYT? 



##########
python/src/nanoarrow/visitor.py:
##########
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import Any, List, Sequence, Tuple, Union
+
+from nanoarrow._lib import CArrayView
+from nanoarrow.c_array_stream import c_array_stream
+from nanoarrow.iterator import ArrayViewBaseIterator, PyIterator
+from nanoarrow.schema import Type
+
+
+def to_pylist(obj, schema=None) -> List:
+    """Convert ``obj`` to a ``list()` of Python objects
+
+    Computes an identical value to ``list(iterator.iter_py())`` but is several
+    times faster.
+
+    Paramters
+    ---------
+    obj : array stream-like
+        An array-like or array stream-like object as sanitized by
+        :func:`c_array_stream`.
+    schema : schema-like, optional
+        An optional schema, passed to :func:`c_array_stream`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> from nanoarrow import visitor
+    >>> array = na.c_array([1, 2, 3], na.int32())
+    >>> visitor.to_pylist(array)
+    [1, 2, 3]
+    """
+    return ListBuilder.visit(obj, schema)
+
+
+def to_columns(obj, schema=None) -> Tuple[List[str], List[Sequence]]:
+    """Convert ``obj`` to a ``list()` of sequences
+
+    Converts a stream of struct arrays into its column-wise representation
+    such that each column is either a contiguous buffer or a ``list()``.
+
+    Paramters
+    ---------
+    obj : array stream-like
+        An array-like or array stream-like object as sanitized by
+        :func:`c_array_stream`.
+    schema : schema-like, optional
+        An optional schema, passed to :func:`c_array_stream`.
+
+    Examples
+    --------
+
+    >>> import nanoarrow as na
+    >>> from nanoarrow import visitor
+    >>> import pyarrow as pa
+    >>> array = pa.record_batch([pa.array([1, 2, 3])], names=["col1"])
+    >>> names, columns = visitor.to_columns(array)
+    >>> names
+    ['col1']
+    >>> columns
+    [[1, 2, 3]]
+    """
+    return ColumnsBuilder.visit(obj, schema)
+
+
+class ArrayStreamVisitor(ArrayViewBaseIterator):
+    """Compute a value from one or more arrays in an ArrowArrayStream
+
+    This class supports a (currently internal) pattern for building
+    output from a zero or more arrays in a stream.
+
+    """
+
+    @classmethod
+    def visit(cls, obj, schema=None, total_elements=None, **kwargs):
+        """Visit all chunks in ``obj`` as a :func:`c_array_stream`."""
+
+        if total_elements is None and hasattr(obj, "__len__"):
+            total_elements = len(obj)
+
+        with c_array_stream(obj, schema=schema) as stream:
+            visitor = cls(stream._get_cached_schema(), **kwargs)
+            visitor.begin(total_elements)
+
+            visitor_set_array = visitor._set_array
+            visit_chunk_view = visitor.visit_chunk_view
+            array_view = visitor._array_view
+
+            for array in stream:
+                visitor_set_array(array)
+                visit_chunk_view(array_view)
+
+        return visitor.finish()
+
+    def begin(self, total_elements: Union[int, None] = None):
+        """Called after the schema has been resolved but before any
+        chunks have been visited. If the total number of elements
+        (i.e., the sum of all chunk lengths) is known, it is provided here.
+        """
+        pass
+
+    def visit_chunk_view(self, array_view: CArrayView) -> None:
+        """Called exactly one for each chunk seen."""
+        pass
+
+    def finish(self) -> Any:
+        """Called exactly once after all chunks have been visited."""
+        return None
+
+
+class ListBuilder(ArrayStreamVisitor):
+    def __init__(self, schema, *, iterator_cls=PyIterator, _array_view=None):

Review Comment:
   nit: `_array_view` -> `array_view` in ListBuilder/ColumnsBuilder



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to