Re: [PR] feat(python): Add repr iterator [arrow-nanoarrow]

via GitHub Mon, 04 Mar 2024 15:33:56 -0800


danepitkin commented on code in PR #397:
URL: https://github.com/apache/arrow-nanoarrow/pull/397#discussion_r1511898186



##########
python/src/nanoarrow/iterator.py:
##########
@@ -83,8 +84,42 @@ def itertuples(obj, schema=None) -> Iterable[Tuple]:
     return RowTupleIterator.get_iterator(obj, schema=schema)
 
 
+def iterrepr(obj, schema=None, max_width: int = 80) -> Iterable[str]:
+    """Iterator of reprs with bounded size for each item
+
+    Whereas the the RowTupleIterator and the RowIterator are optimized for
+    converting the entire input to Python, this iterator is defensive
+    about materializing elements as Python objects and in most cases will
+    not materialize a value or child value in its entirity once ``max_width``
+    characters have already been materialized.
+
+    Paramters

Review Comment:
   ```suggestion
       Parameters
   ```



##########
python/src/nanoarrow/iterator.py:
##########
@@ -83,8 +84,42 @@ def itertuples(obj, schema=None) -> Iterable[Tuple]:
     return RowTupleIterator.get_iterator(obj, schema=schema)
 
 
+def iterrepr(obj, schema=None, max_width: int = 80) -> Iterable[str]:

Review Comment:
   Hmm, how would you feel about approaching this a different way? Instead of 
   ```
       >>> for item in iterator.iterrepr(array, max_width=9):
       ...     print(item)
   ```
   we could potentially do something like
   ```
       >>> for item in array:
       ...     print(item.to_string(max_width=9))
   ```
   This would be more of a bottoms-up approach rather than top-down, where each 
data type has its own `to_string()` method that gets called and we don't need 
to keep track of the nested depth in an auxiliary class like `ReprIterator`



##########
python/src/nanoarrow/iterator.py:
##########
@@ -276,6 +301,217 @@ def _iter1(self, offset, length):
         return self._struct_tuple_iter(offset, length)
 
 
+class ReprIterator(RowIterator):
+    """Iterate over elements by materializing as little of each element
+    as possible. This is not designed to be a general-purpose formatter;
+    however, it is designed to allow the nanoarrow.Array class to have a
+    reasonable repr() method that doesn't overflow a console or hang when
+    printing arbitrary input.
+    """
+
+    def __init__(self, schema, *, _array_view=None, max_width=120, out=None):
+        # Pass the same instance of out to all children. If unspecified,
+        # this is the top-level iterator that will request the value from
+        # out for each item.
+        if out is None:
+            self._out = ReprBuilder(max_width)
+            self._top_level = True
+        else:
+            self._out = out
+            self._top_level = False
+
+        super().__init__(schema, _array_view=_array_view)
+
+    def _make_child(self, schema, array_view):
+        # Ensure children all refer to the same instance of the ReprBuilder
+        return ReprIterator(schema, _array_view=array_view, out=self._out)
+
+    def _iter1(self, offset, length):
+        # The iterator resolved by the RowIterator here will yield False
+        # if self._out.full(). This pattern allows arbitrary levels of
+        # recursion for nested types while allowing child iterators to signal
+        # to higher levels that there is no need to write any further
+        # values for the top-level element.
+        parent = super()._iter1(offset, length)
+
+        # If we are at the top level, yield the (potentially abbreviated)
+        # repr() string for each element. For child elements, yield
+        # False if self._out.full() or True otherwise.
+        if self._top_level:
+            return self._repr_wrapper(parent)
+        else:
+            return parent
+
+    def _repr_wrapper(self, parent):
+        # A wrapper that iterates over the parent (where consuming a value of 
the
+        # iterator writes to self._out)
+        for _ in parent:
+            yield self._out.finish()
+
+    def _dictionary_iter(self, offset, length):
+        # Rather than materialize the entire dictionary, materialize a single
+        # element as required. This is slower for materializing many values but
+        # typically this iterator will only materialize the number of rows on a
+        # console at most.
+        for dict_index in super()._primitive_iter(offset, length):
+            if dict_index is None:
+                yield self._out.write_null()
+            else:
+                dict_iter = self._dictionary._iter1(dict_index, 1)
+                yield next(dict_iter)
+
+    def _write_list_item(self, offset_plus_i, start, end):
+        # Whereas the RowIterator resolves one iterator for child and pulls
+        # end - start values from it for each element, this iterator resolves
+        # length-one child iterator for each element and consumes it. This
+        # pattern avoids materializing unnecessary elements.
+        validity = self._array_view.buffer(0)
+        item_is_valid = len(validity) == 0 or validity.element(offset_plus_i)
+        if not item_is_valid:
+            return self._out.write_null()
+
+        self._out.write("[")
+        for i in range(start, end):
+            if i > start:
+                self._out.write(", ")
+
+            child_iter = self._children[0]._iter1(i, 1)
+            if next(child_iter) is False:
+                return False
+
+        return self._out.write("]")
+
+    def _list_iter(self, offset, length):
+        offset += self._array_view.offset
+        offsets = memoryview(self._array_view.buffer(1))[offset : (offset + 
length + 1)]
+        starts = offsets[:-1]
+        ends = offsets[1:]
+
+        for i, start, end in zip(range(length), starts, ends):
+            yield self._write_list_item(offset + i, start, end)
+
+    def _fixed_size_list_iter(self, offset, length):
+        offset += self._array_view.offset
+        fixed_size = self._array_view.layout.child_size_elements
+        for i in range(length):
+            yield self._write_list_item(
+                offset + i, (offset + i) * fixed_size, (offset + i + 1) * 
fixed_size
+            )
+
+    def _write_struct_item(self, offset_plus_i):
+        # Whereas the RowIterator and RowTupleIterator materialize all child
+        # iterators and always materialize a value for every child element,
+        # this pattern resolves a length-one iterator for every child, consumes
+        # it, and stops iterating if there are many columns. Very wide tables
+        # will still incur a full loop over children at the C level (e.g., in
+        # ArrowArrayViewInitFromSchema() and ArrowArrayViewSetArray()); 
however,
+        # should not incur any Python calls in a full loop over children.
+        validity = self._array_view.buffer(0)
+        item_is_valid = len(validity) == 0 or validity.element(offset_plus_i)
+        if not item_is_valid:
+            return self._out.write_null()
+
+        self._out.write("{")
+        for i, child in enumerate(self._children):
+            if i > 0:
+                self._out.write(", ")
+
+            child_name = child._schema.name
+            if self._out.write(repr(child_name)) is False:
+                return False
+
+            self._out.write(": ")
+
+            child_iter = child._iter1(offset_plus_i, 1)
+            if next(child_iter) is False:
+                return False
+
+        return self._out.write("}")
+
+    def _struct_iter(self, offset, length):
+        offset += self._array_view.offset
+        for i in range(length):
+            yield self._write_struct_item(offset + i)
+
+    def _string_iter(self, offset, length):
+        # A variant of iterating over strings that ensures that very large
+        # elements are not fully materialized as a Python object.
+        memoryviews = super()._binary_view_iter(offset, length)
+
+        # In the end-member scenario where every code point is four bytes,
+        # ensure we still slice enough bytes to fill max_width. Give some
+        # bytes on the end in case the last byte is the beginning of a
+        # multibyte character and to ensure we never get the trailing quote
+        # for an incomplete repr
+        max_width_slice_bytes = (self._out._max_size + 2) * 4
+        for mv in memoryviews:
+            if mv is None:
+                yield self._out.write_null()
+            else:
+                str_begin = bytes(mv[:max_width_slice_bytes]).decode()
+                yield self._out.write(repr(str_begin))
+
+    def _binary_iter(self, offset, length):
+        # A variant of iterating over strings that ensures that very large
+        # elements are not fully materialized as a Python object.
+
+        memoryviews = super()._binary_view_iter(offset, length)
+        # Give some extra bytes to ensure we never get a trailing ' for an
+        # incomplete repr.
+        max_width_slice_bytes = self._out._max_size + 4
+        for mv in memoryviews:
+            if mv is None:
+                yield self._out.write_null()
+            else:
+                yield self._out.write(repr(bytes(mv[:max_width_slice_bytes])))
+
+    def _primitive_iter(self, offset, length):
+        # These types can have relatively long reprs (e.g., large int64) but 
they
+        # are never so large that they can hang the repr (with the possible
+        # exception of fixed-size binary in rare cases).
+        for item in super()._primitive_iter(offset, length):
+            if item is None:
+                yield self._out.write_null()
+            else:
+                yield self._out.write(repr(item))
+
+
+class ReprBuilder:
+    """Stateful string builder for building repr strings
+
+    A wrapper around io.StringIO() that keeps track of how many
+    characters have been written vs some maximum size.
+    """
+
+    def __init__(self, max_size) -> None:
+        self._out = StringIO()
+        self._max_size = max_size
+        self._size = 0
+
+    def full(self):
+        return self._size > self._max_size
+
+    def write_null(self):
+        return self.write(repr(None))
+
+    def write(self, content):
+        """Write string content. Returns True if there are not yet max_size
+        characters in this items's repr and False otherwise."""
+        self._out.write(content)

Review Comment:
   What if instead we only write the first bytes of content equal to 
`self._max_size - self._size`? This could get rid of unnecessary writing of 
data that would otherwise be truncated in `finish()`



##########
python/src/nanoarrow/iterator.py:
##########
@@ -276,6 +301,217 @@ def _iter1(self, offset, length):
         return self._struct_tuple_iter(offset, length)
 
 
+class ReprIterator(RowIterator):
+    """Iterate over elements by materializing as little of each element
+    as possible. This is not designed to be a general-purpose formatter;
+    however, it is designed to allow the nanoarrow.Array class to have a
+    reasonable repr() method that doesn't overflow a console or hang when
+    printing arbitrary input.
+    """
+
+    def __init__(self, schema, *, _array_view=None, max_width=120, out=None):
+        # Pass the same instance of out to all children. If unspecified,
+        # this is the top-level iterator that will request the value from
+        # out for each item.
+        if out is None:
+            self._out = ReprBuilder(max_width)
+            self._top_level = True
+        else:
+            self._out = out
+            self._top_level = False
+
+        super().__init__(schema, _array_view=_array_view)
+
+    def _make_child(self, schema, array_view):
+        # Ensure children all refer to the same instance of the ReprBuilder
+        return ReprIterator(schema, _array_view=array_view, out=self._out)
+
+    def _iter1(self, offset, length):
+        # The iterator resolved by the RowIterator here will yield False
+        # if self._out.full(). This pattern allows arbitrary levels of
+        # recursion for nested types while allowing child iterators to signal
+        # to higher levels that there is no need to write any further
+        # values for the top-level element.
+        parent = super()._iter1(offset, length)
+
+        # If we are at the top level, yield the (potentially abbreviated)
+        # repr() string for each element. For child elements, yield
+        # False if self._out.full() or True otherwise.
+        if self._top_level:
+            return self._repr_wrapper(parent)
+        else:
+            return parent
+
+    def _repr_wrapper(self, parent):
+        # A wrapper that iterates over the parent (where consuming a value of 
the
+        # iterator writes to self._out)
+        for _ in parent:
+            yield self._out.finish()
+
+    def _dictionary_iter(self, offset, length):
+        # Rather than materialize the entire dictionary, materialize a single
+        # element as required. This is slower for materializing many values but
+        # typically this iterator will only materialize the number of rows on a
+        # console at most.
+        for dict_index in super()._primitive_iter(offset, length):
+            if dict_index is None:
+                yield self._out.write_null()
+            else:
+                dict_iter = self._dictionary._iter1(dict_index, 1)
+                yield next(dict_iter)
+
+    def _write_list_item(self, offset_plus_i, start, end):
+        # Whereas the RowIterator resolves one iterator for child and pulls
+        # end - start values from it for each element, this iterator resolves
+        # length-one child iterator for each element and consumes it. This
+        # pattern avoids materializing unnecessary elements.
+        validity = self._array_view.buffer(0)
+        item_is_valid = len(validity) == 0 or validity.element(offset_plus_i)
+        if not item_is_valid:
+            return self._out.write_null()
+
+        self._out.write("[")
+        for i in range(start, end):
+            if i > start:
+                self._out.write(", ")
+
+            child_iter = self._children[0]._iter1(i, 1)
+            if next(child_iter) is False:
+                return False
+
+        return self._out.write("]")
+
+    def _list_iter(self, offset, length):
+        offset += self._array_view.offset
+        offsets = memoryview(self._array_view.buffer(1))[offset : (offset + 
length + 1)]
+        starts = offsets[:-1]
+        ends = offsets[1:]
+
+        for i, start, end in zip(range(length), starts, ends):
+            yield self._write_list_item(offset + i, start, end)
+
+    def _fixed_size_list_iter(self, offset, length):
+        offset += self._array_view.offset
+        fixed_size = self._array_view.layout.child_size_elements
+        for i in range(length):
+            yield self._write_list_item(
+                offset + i, (offset + i) * fixed_size, (offset + i + 1) * 
fixed_size
+            )
+
+    def _write_struct_item(self, offset_plus_i):
+        # Whereas the RowIterator and RowTupleIterator materialize all child
+        # iterators and always materialize a value for every child element,
+        # this pattern resolves a length-one iterator for every child, consumes
+        # it, and stops iterating if there are many columns. Very wide tables
+        # will still incur a full loop over children at the C level (e.g., in
+        # ArrowArrayViewInitFromSchema() and ArrowArrayViewSetArray()); 
however,
+        # should not incur any Python calls in a full loop over children.
+        validity = self._array_view.buffer(0)
+        item_is_valid = len(validity) == 0 or validity.element(offset_plus_i)
+        if not item_is_valid:
+            return self._out.write_null()
+
+        self._out.write("{")
+        for i, child in enumerate(self._children):
+            if i > 0:
+                self._out.write(", ")
+
+            child_name = child._schema.name
+            if self._out.write(repr(child_name)) is False:
+                return False
+
+            self._out.write(": ")
+
+            child_iter = child._iter1(offset_plus_i, 1)
+            if next(child_iter) is False:
+                return False
+
+        return self._out.write("}")
+
+    def _struct_iter(self, offset, length):
+        offset += self._array_view.offset
+        for i in range(length):
+            yield self._write_struct_item(offset + i)
+
+    def _string_iter(self, offset, length):
+        # A variant of iterating over strings that ensures that very large
+        # elements are not fully materialized as a Python object.
+        memoryviews = super()._binary_view_iter(offset, length)
+
+        # In the end-member scenario where every code point is four bytes,
+        # ensure we still slice enough bytes to fill max_width. Give some
+        # bytes on the end in case the last byte is the beginning of a
+        # multibyte character and to ensure we never get the trailing quote
+        # for an incomplete repr
+        max_width_slice_bytes = (self._out._max_size + 2) * 4
+        for mv in memoryviews:
+            if mv is None:
+                yield self._out.write_null()
+            else:
+                str_begin = bytes(mv[:max_width_slice_bytes]).decode()
+                yield self._out.write(repr(str_begin))
+
+    def _binary_iter(self, offset, length):
+        # A variant of iterating over strings that ensures that very large
+        # elements are not fully materialized as a Python object.
+
+        memoryviews = super()._binary_view_iter(offset, length)
+        # Give some extra bytes to ensure we never get a trailing ' for an
+        # incomplete repr.
+        max_width_slice_bytes = self._out._max_size + 4
+        for mv in memoryviews:
+            if mv is None:
+                yield self._out.write_null()
+            else:
+                yield self._out.write(repr(bytes(mv[:max_width_slice_bytes])))
+
+    def _primitive_iter(self, offset, length):
+        # These types can have relatively long reprs (e.g., large int64) but 
they
+        # are never so large that they can hang the repr (with the possible
+        # exception of fixed-size binary in rare cases).
+        for item in super()._primitive_iter(offset, length):
+            if item is None:
+                yield self._out.write_null()
+            else:
+                yield self._out.write(repr(item))
+
+
+class ReprBuilder:
+    """Stateful string builder for building repr strings
+
+    A wrapper around io.StringIO() that keeps track of how many
+    characters have been written vs some maximum size.
+    """
+
+    def __init__(self, max_size) -> None:
+        self._out = StringIO()
+        self._max_size = max_size
+        self._size = 0
+
+    def full(self):
+        return self._size > self._max_size
+
+    def write_null(self):
+        return self.write(repr(None))
+
+    def write(self, content):
+        """Write string content. Returns True if there are not yet max_size
+        characters in this items's repr and False otherwise."""
+        self._out.write(content)
+        self._size += len(content)
+        return not self.full()

Review Comment:
   Typically, I expect a `write()` API to return nothing or return True only if 
the call succeeded. I'd recommend leaving the caller to check `self.full()`.



##########
python/src/nanoarrow/iterator.py:
##########
@@ -276,6 +301,217 @@ def _iter1(self, offset, length):
         return self._struct_tuple_iter(offset, length)
 
 
+class ReprIterator(RowIterator):
+    """Iterate over elements by materializing as little of each element
+    as possible. This is not designed to be a general-purpose formatter;
+    however, it is designed to allow the nanoarrow.Array class to have a
+    reasonable repr() method that doesn't overflow a console or hang when
+    printing arbitrary input.
+    """
+
+    def __init__(self, schema, *, _array_view=None, max_width=120, out=None):
+        # Pass the same instance of out to all children. If unspecified,
+        # this is the top-level iterator that will request the value from
+        # out for each item.
+        if out is None:
+            self._out = ReprBuilder(max_width)
+            self._top_level = True
+        else:
+            self._out = out
+            self._top_level = False
+
+        super().__init__(schema, _array_view=_array_view)
+
+    def _make_child(self, schema, array_view):
+        # Ensure children all refer to the same instance of the ReprBuilder
+        return ReprIterator(schema, _array_view=array_view, out=self._out)
+
+    def _iter1(self, offset, length):

Review Comment:
   Can we find a function name that is more descriptive? I find this particular 
function to be quite complex.



##########
python/src/nanoarrow/iterator.py:
##########
@@ -276,6 +301,217 @@ def _iter1(self, offset, length):
         return self._struct_tuple_iter(offset, length)
 
 
+class ReprIterator(RowIterator):
+    """Iterate over elements by materializing as little of each element
+    as possible. This is not designed to be a general-purpose formatter;
+    however, it is designed to allow the nanoarrow.Array class to have a
+    reasonable repr() method that doesn't overflow a console or hang when
+    printing arbitrary input.
+    """
+
+    def __init__(self, schema, *, _array_view=None, max_width=120, out=None):
+        # Pass the same instance of out to all children. If unspecified,
+        # this is the top-level iterator that will request the value from
+        # out for each item.
+        if out is None:
+            self._out = ReprBuilder(max_width)
+            self._top_level = True
+        else:
+            self._out = out

Review Comment:
   Why would a caller want to specify their own `out`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python): Add repr iterator [arrow-nanoarrow]

Reply via email to