This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b2783d96 feat(python): Implement bitmap unpacking (#450)
b2783d96 is described below

commit b2783d9603022709f0d7507b51b2b1887de53861
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu May 2 14:01:03 2024 -0300

    feat(python): Implement bitmap unpacking (#450)
    
    In prototyping a real-world use case, I remembered that unpacking bits
    is exceedingly difficult to get right if you need to support an
    arbitrary offset/length. The math for this is very fiddly and we spent a
    few rounds getting it right in the C function
    `ArrowBitsUnpackInt(8|32)`. This PR makes that available so that we can
    do things like (1) convert bool arrays to numpy and (2) convert null
    masks to something that somebody else can work with (e.g., a numpy
    mask).
    
    This seems to be relatively performant (thanks to WillAyd's work
    optimizing this!)
    
    ```python
    import numpy as np
    import nanoarrow as na
    import pyarrow as pa
    
    bool_np = np.random.random(int(1e6)) > 0.5
    bool_na = na.Array(iter(bool_np), na.bool_())
    bool_pa = pa.array(bool_np)
    
    def to_numpy_na(x):
        x_view = na.c_array(x).view()
        out = np.empty(x_view.length, bool)
        x_view.buffer(1).unpack_bits_into(out, x_view.offset, x_view.length)
        return out
    
    %timeit to_numpy_na(bool_na)
    #> 162 µs ± 812 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
    
    %timeit bool_pa.to_numpy(False)
    #> 609 µs ± 833 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
    ```
---
 python/src/nanoarrow/_lib.pyx      | 46 ++++++++++++++++++++++++++++++++++
 python/tests/test_c_buffer_view.py | 51 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/python/src/nanoarrow/_lib.pyx b/python/src/nanoarrow/_lib.pyx
index 04602120..2f193308 100644
--- a/python/src/nanoarrow/_lib.pyx
+++ b/python/src/nanoarrow/_lib.pyx
@@ -1815,6 +1815,52 @@ cdef class CBufferView:
         else:
             return self._iter_dispatch(offset, length)
 
+    def unpack_bits_into(self, dest, offset=0, length=None, dest_offset=0):
+        if self._data_type != NANOARROW_TYPE_BOOL:
+            raise ValueError("Can't unpack non-boolean buffer")
+
+        if length is None:
+            length = self.n_elements
+
+        if offset < 0 or length < 0 or (offset + length) > self.n_elements:
+            raise IndexError(
+                f"offset {offset} and length {length} do not describe a valid slice "
+                f"of buffer with {self.n_elements} elements"
+            )
+
+        cdef Py_buffer buffer
+        PyObject_GetBuffer(dest, &buffer, PyBUF_WRITABLE | PyBUF_ANY_CONTIGUOUS)
+        if buffer.itemsize != 1:
+            PyBuffer_Release(&buffer)
+            raise ValueError("Destination buffer has itemsize != 1")
+
+        if dest_offset < 0 or buffer.len < (dest_offset + length):
+            buffer_len = buffer.len
+            PyBuffer_Release(&buffer)
+            raise IndexError(
+                f"Can't unpack {length} elements into buffer of size {buffer_len} "
+                f"with dest_offset = {dest_offset}"
+            )
+
+        ArrowBitsUnpackInt8(
+            self._ptr.data.as_uint8,
+            offset,
+            length,
+            &(<int8_t*>buffer.buf)[dest_offset]
+        )
+
+        PyBuffer_Release(&buffer)
+
+    def unpack_bits(self, offset=0, length=None):
+        if length is None:
+            length = self.n_elements
+
+        out = CBufferBuilder().set_data_type(NANOARROW_TYPE_UINT8)
+        out.reserve_bytes(length)
+        self.unpack_bits_into(out, offset, length)
+        out.advance(length)
+        return out.finish()
+
     def _iter_bitmap(self, int64_t offset, int64_t length):
         cdef uint8_t item
         cdef int64_t i
diff --git a/python/tests/test_c_buffer_view.py b/python/tests/test_c_buffer_view.py
index 6ef00d9e..6e1a4142 100644
--- a/python/tests/test_c_buffer_view.py
+++ b/python/tests/test_c_buffer_view.py
@@ -62,6 +62,53 @@ def test_buffer_view_bool_():
     assert "10010000" in repr(view)
 
 
+def test_buffer_view_bool_unpack():
+    from array import array
+
+    bool_array_view = na.c_array([1, 0, 0, 1], na.bool_()).view()
+    view = bool_array_view.buffer(1)
+
+    # Check unpacking
+    unpacked_all = view.unpack_bits()
+    assert len(unpacked_all) == view.n_elements
+    assert unpacked_all.data_type == "uint8"
+    assert list(unpacked_all) == [1, 0, 0, 1, 0, 0, 0, 0]
+
+    unpacked_some = view.unpack_bits(1, 4)
+    assert len(unpacked_some) == 4
+    assert list(unpacked_some) == [0, 0, 1, 0]
+
+    # Check with non-zero destination offset
+    out = bytearray([255] * 10)
+    view.unpack_bits_into(out, dest_offset=2)
+    assert list(out) == [255, 255, 1, 0, 0, 1, 0, 0, 0, 0]
+
+    # Check error requesting out-of-bounds dest_offset
+    with pytest.raises(IndexError, match="Can't unpack"):
+        view.unpack_bits_into(out, dest_offset=-1)
+
+    # Check errors from requesting out-of-bounds slices
+    msg = "do not describe a valid slice"
+    with pytest.raises(IndexError, match=msg):
+        view.unpack_bits(-1, None)
+    with pytest.raises(IndexError, match=msg):
+        view.unpack_bits(0, -1)
+    with pytest.raises(IndexError, match=msg):
+        view.unpack_bits(0, 9)
+
+    # Check errors from an output buffer of insufficient length
+    out = bytearray()
+    msg = "Can't unpack 8 elements into buffer of size 0"
+    with pytest.raises(IndexError, match=msg):
+        view.unpack_bits_into(out)
+
+    # Check errors from an output buffer with the wrong data type
+    out = array("i", [0, 0, 0, 0])
+    msg = "Destination buffer has itemsize != 1"
+    with pytest.raises(ValueError, match=msg):
+        view.unpack_bits_into(out)
+
+
 def test_buffer_view_non_bool():
     array_view = na.c_array([1, 2, 3, 5], na.int32()).view()
     view = array_view.buffer(1)
@@ -99,5 +146,9 @@ def test_buffer_view_non_bool():
     with pytest.raises(IndexError, match="do not describe a valid slice"):
         view.elements(1, 4)
 
+    # Check that unpacking will error
+    with pytest.raises(ValueError, match="Can't unpack non-boolean buffer"):
+        view.unpack_bits()
+
     # Check repr
     assert "1 2 3 5" in repr(view)

Reply via email to