This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f43cf0d614 GH-41662: [Python] Ensure Buffer methods don't crash with
non-CPU data (#41889)
f43cf0d614 is described below
commit f43cf0d6146068b8a1a1f61be248dc7fa6fc6591
Author: Alenka Frim <[email protected]>
AuthorDate: Thu Jun 13 17:25:20 2024 +0200
GH-41662: [Python] Ensure Buffer methods don't crash with non-CPU data
(#41889)
### Rationale for this change
`hex()` and `__getitem__` currently segfault if the data is not located on
the CPU.
### What changes are included in this PR?
This PR adds a check and returns `NotImplementedError` if the data is not
on CPU.
### Are these changes tested?
Yes
### Are there any user-facing changes?
No.
* GitHub Issue: #41662
Lead-authored-by: AlenkaF <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/io.pxi | 24 ++++++++++++++
python/pyarrow/tests/test_io.py | 70 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 94 insertions(+)
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index 48b7934209..3a0125e957 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -1286,6 +1286,10 @@ cdef class Buffer(_Weakrefable):
f"is_cpu={self.is_cpu} "
f"is_mutable={self.is_mutable}>")
+ def _assert_cpu(self):
+ if not self.is_cpu:
+ raise NotImplementedError("Implemented only for data on CPU
device")
+
@property
def size(self):
"""
@@ -1311,6 +1315,7 @@ cdef class Buffer(_Weakrefable):
-------
: bytes
"""
+ self._assert_cpu()
return self.buffer.get().ToHexString()
@property
@@ -1378,6 +1383,7 @@ cdef class Buffer(_Weakrefable):
return self.getitem(_normalize_index(key, self.size))
cdef getitem(self, int64_t i):
+ self._assert_cpu()
return self.buffer.get().data()[i]
def slice(self, offset=0, length=None):
@@ -1424,6 +1430,18 @@ cdef class Buffer(_Weakrefable):
are_equal : bool
True if buffer contents and size are equal
"""
+ if self.device != other.device:
+ raise ValueError(
+ "Device on which the data resides differs between buffers: "
+ f"{self.device.type_name} and {other.device.type_name}."
+ )
+ if not self.is_cpu:
+ if self.address != other.address:
+ raise NotImplementedError(
+ "Implemented only for data on CPU device or data with
equal "
+ "addresses"
+ )
+
cdef c_bool result = False
with nogil:
result = self.buffer.get().Equals(deref(other.buffer.get()))
@@ -1436,6 +1454,8 @@ cdef class Buffer(_Weakrefable):
return self.equals(py_buffer(other))
def __reduce_ex__(self, protocol):
+ self._assert_cpu()
+
if protocol >= 5:
bufobj = pickle.PickleBuffer(self)
elif self.buffer.get().is_mutable():
@@ -1452,11 +1472,15 @@ cdef class Buffer(_Weakrefable):
"""
Return this buffer as a Python bytes object. Memory is copied.
"""
+ self._assert_cpu()
+
return cp.PyBytes_FromStringAndSize(
<const char*>self.buffer.get().data(),
self.buffer.get().size())
def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
+ self._assert_cpu()
+
if self.buffer.get().is_mutable():
buffer.readonly = 0
else:
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 17eab871a2..2306014c41 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -669,6 +669,76 @@ def test_allocate_buffer_resizable():
assert buf.size == 200
+def test_non_cpu_buffer(pickle_module):
+ cuda = pytest.importorskip("pyarrow.cuda")
+ ctx = cuda.Context(0)
+
+ data = np.array([b'testing'])
+ cuda_buf = ctx.buffer_from_data(data)
+ arr = pa.FixedSizeBinaryArray.from_buffers(pa.binary(7), 1, [None,
cuda_buf])
+ buf_on_gpu = arr.buffers()[1]
+
+ assert buf_on_gpu.size == cuda_buf.size
+ assert buf_on_gpu.address == cuda_buf.address
+ assert buf_on_gpu.is_cpu == cuda_buf.is_cpu
+ assert buf_on_gpu.is_mutable
+
+ repr1 = "<pyarrow.Buffer address="
+ repr2 = "size=7 is_cpu=False is_mutable=True>"
+ assert repr1 in repr(buf_on_gpu)
+ assert repr2 in repr(buf_on_gpu)
+
+ buf_on_gpu_sliced = buf_on_gpu.slice(2)
+ cuda_sliced = cuda.CudaBuffer.from_buffer(buf_on_gpu_sliced)
+ assert cuda_sliced.to_pybytes() == b'sting'
+
+ buf_on_gpu_sliced = buf_on_gpu[2:4]
+ cuda_sliced = cuda.CudaBuffer.from_buffer(buf_on_gpu_sliced)
+ assert cuda_sliced.to_pybytes() == b'st'
+
+ # Sliced buffers with same address
+ assert buf_on_gpu_sliced.equals(cuda_buf[2:4])
+
+ # Buffers on different devices
+ msg_device = "Device on which the data resides differs between buffers"
+ with pytest.raises(ValueError, match=msg_device):
+ buf_on_gpu.equals(pa.py_buffer(data))
+
+ msg = "Implemented only for data on CPU device"
+ # Buffers with different addresses
+ arr_short = np.array([b'sting'])
+ cuda_buf_short = ctx.buffer_from_data(arr_short)
+ with pytest.raises(NotImplementedError, match=msg):
+ buf_on_gpu_sliced.equals(cuda_buf_short)
+ arr_short = pa.FixedSizeBinaryArray.from_buffers(
+ pa.binary(5), 1, [None, cuda_buf_short]
+ )
+ buf_on_gpu_short = arr_short.buffers()[1]
+ with pytest.raises(NotImplementedError, match=msg):
+ buf_on_gpu_sliced.equals(buf_on_gpu_short)
+
+ with pytest.raises(NotImplementedError, match=msg):
+ buf_on_gpu.hex()
+
+ with pytest.raises(NotImplementedError, match=msg):
+ cuda_buf.hex()
+
+ with pytest.raises(NotImplementedError, match=msg):
+ buf_on_gpu[1]
+
+ with pytest.raises(NotImplementedError, match=msg):
+ buf_on_gpu.to_pybytes()
+
+ with pytest.raises(NotImplementedError, match=msg):
+ pickle_module.dumps(buf_on_gpu, protocol=4)
+
+ with pytest.raises(NotImplementedError, match=msg):
+ pickle_module.dumps(cuda_buf, protocol=4)
+
+ with pytest.raises(NotImplementedError, match=msg):
+ memoryview(buf_on_gpu)
+
+
def test_cache_options():
opts1 = pa.CacheOptions()
opts2 = pa.CacheOptions(hole_size_limit=1024)