jorisvandenbossche commented on code in PR #426:
URL: https://github.com/apache/arrow-nanoarrow/pull/426#discussion_r1565724432
##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -1904,23 +1945,49 @@ cdef class CBufferBuilder:
return out
def write_elements(self, obj):
Review Comment:
BTW, it would be nice to add some docstrings at some point.
##########
python/src/nanoarrow/_lib.pyx:
##########
@@ -1962,6 +2031,101 @@ cdef class CBufferBuilder:
return f"{class_label}({self.size_bytes}/{self.capacity_bytes})"
+cdef class NoneAwareWrapperIterator:
+ """Nullable iterator wrapper
+
+ This class wraps an iterable ``obj`` that might contain ``None`` values
+ such that the iterable provided by this class contains "empty" (but valid)
+ values. After ``obj`` has been completely consumed, one can call
+ ``finish()`` to obtain the resulting bitmap. This is useful for passing
+ iterables that might contain None to tools that cannot handle them
+ (e.g., struct.pack(), array.array()).
+ """
+ cdef ArrowBitmap _bitmap
+ cdef object _obj
+ cdef object _value_if_none
+ cdef int64_t _valid_count
+ cdef int64_t _item_count
+
+ def __cinit__(self, obj, type_id, item_size_bytes=0):
+ ArrowBitmapInit(&self._bitmap)
+ self._obj = iter(obj)
+
+ self._value_if_none = self._get_value_if_none(type_id, item_size_bytes)
+ self._valid_count = 0
+ self._item_count = 0
+
+ def __dealloc__(self):
+ ArrowBitmapReset(&self._bitmap)
+
+ def reserve(self, int64_t additional_elements):
+ cdef int code = ArrowBitmapReserve(&self._bitmap, additional_elements)
+ Error.raise_error_not_ok(self, code)
+
+ def _get_value_if_none(self, type_id, item_size_bytes=0):
+ if type_id == NANOARROW_TYPE_INTERVAL_DAY_TIME:
+ return (0, 0)
+ elif type_id == NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
+ return (0, 0, 0)
+ elif type_id == NANOARROW_TYPE_BOOL:
+ return False
+ elif type_id in (NANOARROW_TYPE_BINARY,
NANOARROW_TYPE_FIXED_SIZE_BINARY):
+ return b"\x00" * item_size_bytes
+ elif type_id in (NANOARROW_TYPE_HALF_FLOAT, NANOARROW_TYPE_FLOAT,
NANOARROW_TYPE_DOUBLE):
+ return 0.0
+ else:
+ return 0
+
+ cdef _append_to_validity(self, int is_valid):
+ self._valid_count += is_valid
+ self._item_count += 1
+
+ # Avoid allocating a bitmap if all values seen so far are valid
+ if self._valid_count == self._item_count:
+ return
+
+ # If the bitmap hasn't been allocated yet, allocate it now and
+ # fill with 1s for all previous elements.
+ cdef int code
+ if self._bitmap.size_bits == 0 and self._item_count > 1:
+ code = ArrowBitmapAppend(&self._bitmap, 1, self._item_count - 1)
+ if code != NANOARROW_OK:
+ Error.raise_error("ArrowBitmapAppend()", code)
+
+ # Append this element to the bitmap
+ code = ArrowBitmapAppend(&self._bitmap, is_valid, 1)
+ if code != NANOARROW_OK:
+ Error.raise_error("ArrowBitmapAppend()", code)
+
+ def __iter__(self):
+ for item in self._obj:
+ if item is None:
+ self._append_to_validity(0)
+ yield self._value_if_none
+ else:
+ self._append_to_validity(1)
+ yield item
+
+ def finish(self):
+ """Obtain the total count, null count, and validity bitmap after
+ consuming this iterable."""
+ null_count = self._item_count - self._valid_count
+
+ # If we did allocate a bitmap, make sure the last few bits are zeroed
+ if null_count > 0 and self._bitmap.size_bits % 8 != 0:
+ ArrowBitmapAppendUnsafe(&self._bitmap, 0, self._bitmap.size_bits %
8)
+
+ cdef CBuffer validity = CBuffer.empty()
+ ArrowBufferMove(&self._bitmap.buffer, validity._ptr)
+ validity._set_data_type(NANOARROW_TYPE_BOOL)
Review Comment:
Does this only need to be done if a validity bitmap was actually allocated
(i.e., if `null_count > 0`)?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]