[ https://issues.apache.org/jira/browse/ARROW-2068?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16366027#comment-16366027 ]
ASF GitHub Bot commented on ARROW-2068:
---------------------------------------
xhochy closed pull request #1613: ARROW-2068: [Python] Expose array's buffers
URL: https://github.com/apache/arrow/pull/1613
This pull request was merged from a forked repository.
Because GitHub hides the original diff of a fork-based (foreign) pull
request once it is merged, the diff is reproduced below for the sake
of provenance:
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index f85363cb1..a43bfb93b 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -244,6 +244,22 @@ cdef wrap_datum(const CDatum& datum):
raise ValueError("Unable to wrap Datum in a Python object")
+cdef _append_array_buffers(const CArrayData* ad, list res):
+ """
+ Recursively append Buffer wrappers from *ad* and its children.
+ """
+ cdef size_t i, n
+ assert ad != NULL
+ n = ad.buffers.size()
+ for i in range(n):
+ buf = ad.buffers[i]
+ res.append(pyarrow_wrap_buffer(buf)
+ if buf.get() != NULL else None)
+ n = ad.child_data.size()
+ for i in range(n):
+ _append_array_buffers(ad.child_data[i].get(), res)
+
+
cdef class Array:
cdef void init(self, const shared_ptr[CArray]& sp_array):
@@ -463,6 +479,15 @@ cdef class Array:
with nogil:
check_status(ValidateArray(deref(self.ap)))
+ def buffers(self):
+ """
+ Return a list of Buffer objects pointing to this array's physical
+ storage.
+ """
+ res = []
+ _append_array_buffers(self.sp_array.get().data().get(), res)
+ return res
+
cdef class Tensor:
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index efbcef5e1..197dac0d8 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -17,6 +17,7 @@
import datetime
import pytest
+import struct
import sys
import numpy as np
@@ -589,3 +590,66 @@ def test_array_from_numpy_unicode():
arrow_arr = pa.array(arr)
expected = pa.array(['', '', ''], type='utf8')
assert arrow_arr.equals(expected)
+
+
+def test_buffers_primitive():
+ a = pa.array([1, 2, None, 4], type=pa.int16())
+ buffers = a.buffers()
+ assert len(buffers) == 2
+ null_bitmap = buffers[0].to_pybytes()
+ assert 1 <= len(null_bitmap) <= 64 # XXX this is varying
+ assert bytearray(null_bitmap)[0] == 0b00001011
+
+ assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4)
+
+ a = pa.array(np.int8([4, 5, 6]))
+ buffers = a.buffers()
+ assert len(buffers) == 2
+ # No null bitmap from Numpy int array
+ assert buffers[0] is None
+ assert struct.unpack('3b', buffers[1].to_pybytes()) == (4, 5, 6)
+
+ a = pa.array([b'foo!', None, b'bar!!'])
+ buffers = a.buffers()
+ assert len(buffers) == 3
+ null_bitmap = buffers[0].to_pybytes()
+ assert bytearray(null_bitmap)[0] == 0b00000101
+ offsets = buffers[1].to_pybytes()
+ assert struct.unpack('4i', offsets) == (0, 4, 4, 9)
+ values = buffers[2].to_pybytes()
+ assert values == b'foo!bar!!'
+
+
+def test_buffers_nested():
+ a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
+ buffers = a.buffers()
+ assert len(buffers) == 4
+ # The parent buffers
+ null_bitmap = buffers[0].to_pybytes()
+ assert bytearray(null_bitmap)[0] == 0b00000101
+ offsets = buffers[1].to_pybytes()
+ assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
+ # The child buffers
+ null_bitmap = buffers[2].to_pybytes()
+ assert bytearray(null_bitmap)[0] == 0b00110111
+ values = buffers[3].to_pybytes()
+ assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)
+
+ a = pa.array([(42, None), None, (None, 43)],
+ type=pa.struct([pa.field('a', pa.int8()),
+ pa.field('b', pa.int16())]))
+ buffers = a.buffers()
+ assert len(buffers) == 5
+ # The parent buffer
+ null_bitmap = buffers[0].to_pybytes()
+ assert bytearray(null_bitmap)[0] == 0b00000101
+ # The child buffers: 'a'
+ null_bitmap = buffers[1].to_pybytes()
+ assert bytearray(null_bitmap)[0] == 0b00000001
+ values = buffers[2].to_pybytes()
+ assert struct.unpack('bxx', values) == (42,)
+ # The child buffers: 'b'
+ null_bitmap = buffers[3].to_pybytes()
+ assert bytearray(null_bitmap)[0] == 0b00000100
+ values = buffers[4].to_pybytes()
+ assert struct.unpack('4xh', values) == (43,)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] Expose Array's buffers to Python users
> -----------------------------------------------
>
> Key: ARROW-2068
> URL: https://issues.apache.org/jira/browse/ARROW-2068
> Project: Apache Arrow
> Issue Type: Improvement
> Components: Python
> Reporter: Wes McKinney
> Assignee: Antoine Pitrou
> Priority: Major
> Labels: pull-request-available
>
> This amounts to converting {{arr->data()->buffers}} to a list of
> {{pyarrow.Buffer}} objects.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)