This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f43cf0d614 GH-41662: [Python] Ensure Buffer methods don't crash with 
non-CPU data (#41889)
f43cf0d614 is described below

commit f43cf0d6146068b8a1a1f61be248dc7fa6fc6591
Author: Alenka Frim <[email protected]>
AuthorDate: Thu Jun 13 17:25:20 2024 +0200

    GH-41662: [Python] Ensure Buffer methods don't crash with non-CPU data 
(#41889)
    
    ### Rationale for this change
    
    `hex()` and `__getitem__` currently segfault if the data is not located on 
the CPU.
    
    ### What changes are included in this PR?
    
    This PR adds a check and returns `NotImplementedError` if the data is not 
on CPU.
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No.
    * GitHub Issue: #41662
    
    Lead-authored-by: AlenkaF <[email protected]>
    Co-authored-by: Alenka Frim <[email protected]>
    Co-authored-by: Joris Van den Bossche <[email protected]>
    Signed-off-by: Joris Van den Bossche <[email protected]>
---
 python/pyarrow/io.pxi           | 24 ++++++++++++++
 python/pyarrow/tests/test_io.py | 70 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)

diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index 48b7934209..3a0125e957 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -1286,6 +1286,10 @@ cdef class Buffer(_Weakrefable):
                 f"is_cpu={self.is_cpu} "
                 f"is_mutable={self.is_mutable}>")
 
+    def _assert_cpu(self):
+        if not self.is_cpu:
+            raise NotImplementedError("Implemented only for data on CPU 
device")
+
     @property
     def size(self):
         """
@@ -1311,6 +1315,7 @@ cdef class Buffer(_Weakrefable):
         -------
         : bytes
         """
+        self._assert_cpu()
         return self.buffer.get().ToHexString()
 
     @property
@@ -1378,6 +1383,7 @@ cdef class Buffer(_Weakrefable):
         return self.getitem(_normalize_index(key, self.size))
 
     cdef getitem(self, int64_t i):
+        self._assert_cpu()
         return self.buffer.get().data()[i]
 
     def slice(self, offset=0, length=None):
@@ -1424,6 +1430,18 @@ cdef class Buffer(_Weakrefable):
         are_equal : bool
             True if buffer contents and size are equal
         """
+        if self.device != other.device:
+            raise ValueError(
+                "Device on which the data resides differs between buffers: "
+                f"{self.device.type_name} and {other.device.type_name}."
+            )
+        if not self.is_cpu:
+            if self.address != other.address:
+                raise NotImplementedError(
+                    "Implemented only for data on CPU device or data with 
equal "
+                    "addresses"
+                )
+
         cdef c_bool result = False
         with nogil:
             result = self.buffer.get().Equals(deref(other.buffer.get()))
@@ -1436,6 +1454,8 @@ cdef class Buffer(_Weakrefable):
             return self.equals(py_buffer(other))
 
     def __reduce_ex__(self, protocol):
+        self._assert_cpu()
+
         if protocol >= 5:
             bufobj = pickle.PickleBuffer(self)
         elif self.buffer.get().is_mutable():
@@ -1452,11 +1472,15 @@ cdef class Buffer(_Weakrefable):
         """
         Return this buffer as a Python bytes object. Memory is copied.
         """
+        self._assert_cpu()
+
         return cp.PyBytes_FromStringAndSize(
             <const char*>self.buffer.get().data(),
             self.buffer.get().size())
 
     def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
+        self._assert_cpu()
+
         if self.buffer.get().is_mutable():
             buffer.readonly = 0
         else:
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 17eab871a2..2306014c41 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -669,6 +669,76 @@ def test_allocate_buffer_resizable():
     assert buf.size == 200
 
 
+def test_non_cpu_buffer(pickle_module):
+    cuda = pytest.importorskip("pyarrow.cuda")
+    ctx = cuda.Context(0)
+
+    data = np.array([b'testing'])
+    cuda_buf = ctx.buffer_from_data(data)
+    arr = pa.FixedSizeBinaryArray.from_buffers(pa.binary(7), 1, [None, 
cuda_buf])
+    buf_on_gpu = arr.buffers()[1]
+
+    assert buf_on_gpu.size == cuda_buf.size
+    assert buf_on_gpu.address == cuda_buf.address
+    assert buf_on_gpu.is_cpu == cuda_buf.is_cpu
+    assert buf_on_gpu.is_mutable
+
+    repr1 = "<pyarrow.Buffer address="
+    repr2 = "size=7 is_cpu=False is_mutable=True>"
+    assert repr1 in repr(buf_on_gpu)
+    assert repr2 in repr(buf_on_gpu)
+
+    buf_on_gpu_sliced = buf_on_gpu.slice(2)
+    cuda_sliced = cuda.CudaBuffer.from_buffer(buf_on_gpu_sliced)
+    assert cuda_sliced.to_pybytes() == b'sting'
+
+    buf_on_gpu_sliced = buf_on_gpu[2:4]
+    cuda_sliced = cuda.CudaBuffer.from_buffer(buf_on_gpu_sliced)
+    assert cuda_sliced.to_pybytes() == b'st'
+
+    # Sliced buffers with same address
+    assert buf_on_gpu_sliced.equals(cuda_buf[2:4])
+
+    # Buffers on different devices
+    msg_device = "Device on which the data resides differs between buffers"
+    with pytest.raises(ValueError, match=msg_device):
+        buf_on_gpu.equals(pa.py_buffer(data))
+
+    msg = "Implemented only for data on CPU device"
+    # Buffers with different addresses
+    arr_short = np.array([b'sting'])
+    cuda_buf_short = ctx.buffer_from_data(arr_short)
+    with pytest.raises(NotImplementedError, match=msg):
+        buf_on_gpu_sliced.equals(cuda_buf_short)
+    arr_short = pa.FixedSizeBinaryArray.from_buffers(
+        pa.binary(5), 1, [None, cuda_buf_short]
+    )
+    buf_on_gpu_short = arr_short.buffers()[1]
+    with pytest.raises(NotImplementedError, match=msg):
+        buf_on_gpu_sliced.equals(buf_on_gpu_short)
+
+    with pytest.raises(NotImplementedError, match=msg):
+        buf_on_gpu.hex()
+
+    with pytest.raises(NotImplementedError, match=msg):
+        cuda_buf.hex()
+
+    with pytest.raises(NotImplementedError, match=msg):
+        buf_on_gpu[1]
+
+    with pytest.raises(NotImplementedError, match=msg):
+        buf_on_gpu.to_pybytes()
+
+    with pytest.raises(NotImplementedError, match=msg):
+        pickle_module.dumps(buf_on_gpu, protocol=4)
+
+    with pytest.raises(NotImplementedError, match=msg):
+        pickle_module.dumps(cuda_buf, protocol=4)
+
+    with pytest.raises(NotImplementedError, match=msg):
+        memoryview(buf_on_gpu)
+
+
 def test_cache_options():
     opts1 = pa.CacheOptions()
     opts2 = pa.CacheOptions(hole_size_limit=1024)

Reply via email to