This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 45f008a1ee pyarrow: Wrap capsule extraction errors into an explicit 
error (#9660)
45f008a1ee is described below

commit 45f008a1eee7b475d0ae8757241d43bc1389c252
Author: Thomas Tanon <[email protected]>
AuthorDate: Mon Jun 22 20:14:13 2026 +0200

    pyarrow: Wrap capsule extraction errors into an explicit error (#9660)
    
    Remove validate_pycapsule: pointer_checked already raises an exception
    if the capsule type is wrong, and we wrap it into a nicer error
    
    Also relies on a trait to associate types with their capsules names
    
    Because we wrap the exception (equivalent of `raise X from Y`), the
    original exception is still there
    
    Follow up of  #9639 #9594
---
 arrow-pyarrow-testing/Cargo.toml       |   1 +
 arrow-pyarrow-testing/tests/pyarrow.rs |  29 +++++
 arrow-pyarrow/src/lib.rs               | 210 +++++++++++++++------------------
 3 files changed, 128 insertions(+), 112 deletions(-)

diff --git a/arrow-pyarrow-testing/Cargo.toml b/arrow-pyarrow-testing/Cargo.toml
index d5542e25d6..c5a15b5cc9 100644
--- a/arrow-pyarrow-testing/Cargo.toml
+++ b/arrow-pyarrow-testing/Cargo.toml
@@ -47,5 +47,6 @@ publish = false
 [dependencies]
 # Note no dependency on arrow, to ensure arrow-pyarrow can be used by itself
 arrow-array = { path = "../arrow-array" }
+arrow-schema = { path = "../arrow-schema" }
 arrow-pyarrow = { path = "../arrow-pyarrow" }
 pyo3 = { version = "0.29.0", default-features = false }
diff --git a/arrow-pyarrow-testing/tests/pyarrow.rs 
b/arrow-pyarrow-testing/tests/pyarrow.rs
index c70536b1c3..1e4ad43817 100644
--- a/arrow-pyarrow-testing/tests/pyarrow.rs
+++ b/arrow-pyarrow-testing/tests/pyarrow.rs
@@ -41,6 +41,7 @@ use arrow_array::builder::{BinaryViewBuilder, 
StringViewBuilder};
 use arrow_array::{
     Array, ArrayRef, BinaryViewArray, Int32Array, RecordBatch, StringArray, 
StringViewArray,
 };
+use arrow_schema::Schema;
 use arrow_pyarrow::{FromPyArrow, ToPyArrow};
 use pyo3::exceptions::PyTypeError;
 use pyo3::types::{PyAnyMethods, PyModule};
@@ -141,6 +142,34 @@ value = NotATuple()
     });
 }
 
+#[test]
+fn test_from_pyarrow_non_capsule() {
+    Python::initialize();
+
+    Python::attach(|py| {
+        let code = CString::new(
+            r#"
+class NotACapsule:
+    def __arrow_c_schema__(self):
+        return 1
+
+value = NotACapsule()
+"#,
+        )
+            .unwrap();
+
+        let module = PyModule::from_code(py, code.as_c_str(), c"test.py", 
c"test_module").unwrap();
+        let value = module.getattr("value").unwrap();
+
+        let err = Schema::from_pyarrow_bound(&value).unwrap_err();
+        assert!(err.is_instance_of::<PyTypeError>(py));
+        assert_eq!(
+            err.to_string(),
+            "TypeError: Expected __arrow_c_schema__ to return a capsule."
+        );
+    });
+}
+
 fn binary_view_column(num_variadic_buffers: usize) -> BinaryViewArray {
     let long_scalar = b"but soft what light through yonder window 
breaks".as_slice();
     let mut builder = 
BinaryViewBuilder::new().with_fixed_block_size(long_scalar.len() as u32);
diff --git a/arrow-pyarrow/src/lib.rs b/arrow-pyarrow/src/lib.rs
index 07063b37cf..976d5ebb89 100644
--- a/arrow-pyarrow/src/lib.rs
+++ b/arrow-pyarrow/src/lib.rs
@@ -61,6 +61,7 @@
 
 use std::convert::{From, TryFrom};
 use std::ffi::CStr;
+use std::ptr::NonNull;
 use std::sync::Arc;
 
 use arrow_array::ffi;
@@ -74,19 +75,15 @@ use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef};
 use pyo3::exceptions::{PyTypeError, PyValueError};
 use pyo3::ffi::Py_uintptr_t;
-use pyo3::import_exception;
 use pyo3::prelude::*;
 use pyo3::sync::PyOnceLock;
-use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType};
+use pyo3::types::{PyCapsule, PyDict, PyList, PyType};
+use pyo3::{CastError, import_exception};
 
 import_exception!(pyarrow, ArrowException);
 /// Represents an exception raised by PyArrow.
 pub type PyArrowException = ArrowException;
 
-const ARROW_ARRAY_STREAM_CAPSULE_NAME: &CStr = c"arrow_array_stream";
-const ARROW_SCHEMA_CAPSULE_NAME: &CStr = c"arrow_schema";
-const ARROW_ARRAY_CAPSULE_NAME: &CStr = c"arrow_array";
-
 fn to_py_err(err: ArrowError) -> PyErr {
     PyArrowException::new_err(err.to_string())
 }
@@ -131,56 +128,17 @@ fn validate_class(expected: &Bound<PyType>, value: 
&Bound<PyAny>) -> PyResult<()
     Ok(())
 }
 
-fn validate_pycapsule(capsule: &Bound<PyCapsule>, name: &str) -> PyResult<()> {
-    let capsule_name = capsule.name()?;
-    if capsule_name.is_none() {
-        return Err(PyValueError::new_err(
-            "Expected schema PyCapsule to have name set.",
-        ));
-    }
-
-    let capsule_name = unsafe { capsule_name.unwrap().as_cstr() }
-        .to_str()
-        .map_err(|e| PyValueError::new_err(e.to_string()))?;
-    if capsule_name != name {
-        return Err(PyValueError::new_err(format!(
-            "Expected name '{name}' in PyCapsule, instead got 
'{capsule_name}'",
-        )));
-    }
-
-    Ok(())
-}
-
-fn extract_arrow_c_array_capsules<'py>(
-    value: &Bound<'py, PyAny>,
-) -> PyResult<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)> {
-    let tuple = value.call_method0("__arrow_c_array__")?;
-
-    if !tuple.is_instance_of::<PyTuple>() {
-        return Err(PyTypeError::new_err(
-            "Expected __arrow_c_array__ to return a tuple of (schema, array) 
capsules.",
-        ));
-    }
-
-    tuple.extract().map_err(|_| {
-        PyTypeError::new_err(
-            "Expected __arrow_c_array__ to return a tuple of (schema, array) 
capsules.",
-        )
-    })
-}
-
 impl FromPyArrow for DataType {
     fn from_pyarrow_bound(value: &Bound<PyAny>) -> PyResult<Self> {
         // Newer versions of PyArrow as well as other libraries with Arrow 
data implement this
         // method, so prefer it over _export_to_c.
         // See 
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
-        if value.hasattr("__arrow_c_schema__")? {
-            let capsule = value.call_method0("__arrow_c_schema__")?.extract()?;
-            validate_pycapsule(&capsule, "arrow_schema")?;
-
-            let schema_ptr = capsule
-                .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
-                .cast::<FFI_ArrowSchema>();
+        if let Some(capsule) = call_capsule_method_if_exists(value, 
"__arrow_c_schema__")? {
+            let schema_ptr = extract_capsule::<FFI_ArrowSchema>(
+                &capsule,
+                c"arrow_schema",
+                "__arrow_c_schema__",
+            )?;
             return unsafe { DataType::try_from(schema_ptr.as_ref()) 
}.map_err(to_py_err);
         }
 
@@ -204,13 +162,12 @@ impl FromPyArrow for Field {
         // Newer versions of PyArrow as well as other libraries with Arrow 
data implement this
         // method, so prefer it over _export_to_c.
         // See 
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
-        if value.hasattr("__arrow_c_schema__")? {
-            let capsule = value.call_method0("__arrow_c_schema__")?.extract()?;
-            validate_pycapsule(&capsule, "arrow_schema")?;
-
-            let schema_ptr = capsule
-                .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
-                .cast::<FFI_ArrowSchema>();
+        if let Some(capsule) = call_capsule_method_if_exists(value, 
"__arrow_c_schema__")? {
+            let schema_ptr = extract_capsule::<FFI_ArrowSchema>(
+                &capsule,
+                c"arrow_schema",
+                "__arrow_c_schema__",
+            )?;
             return unsafe { Field::try_from(schema_ptr.as_ref()) 
}.map_err(to_py_err);
         }
 
@@ -234,13 +191,12 @@ impl FromPyArrow for Schema {
         // Newer versions of PyArrow as well as other libraries with Arrow 
data implement this
         // method, so prefer it over _export_to_c.
         // See 
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
-        if value.hasattr("__arrow_c_schema__")? {
-            let capsule = value.call_method0("__arrow_c_schema__")?.extract()?;
-            validate_pycapsule(&capsule, "arrow_schema")?;
-
-            let schema_ptr = capsule
-                .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
-                .cast::<FFI_ArrowSchema>();
+        if let Some(capsule) = call_capsule_method_if_exists(value, 
"__arrow_c_schema__")? {
+            let schema_ptr = extract_capsule::<FFI_ArrowSchema>(
+                &capsule,
+                c"arrow_schema",
+                "__arrow_c_schema__",
+            )?;
             return unsafe { Schema::try_from(schema_ptr.as_ref()) 
}.map_err(to_py_err);
         }
 
@@ -264,23 +220,11 @@ impl FromPyArrow for ArrayData {
         // Newer versions of PyArrow as well as other libraries with Arrow 
data implement this
         // method, so prefer it over _export_to_c.
         // See 
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
-        if value.hasattr("__arrow_c_array__")? {
-            let (schema_capsule, array_capsule) = 
extract_arrow_c_array_capsules(value)?;
-
-            validate_pycapsule(&schema_capsule, "arrow_schema")?;
-            validate_pycapsule(&array_capsule, "arrow_array")?;
-
-            let schema_ptr = schema_capsule
-                .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
-                .cast::<FFI_ArrowSchema>();
-            let array = unsafe {
-                FFI_ArrowArray::from_raw(
-                    array_capsule
-                        .pointer_checked(Some(ARROW_ARRAY_CAPSULE_NAME))?
-                        .cast::<FFI_ArrowArray>()
-                        .as_ptr(),
-                )
-            };
+        if let Some((schema_capsule, array_capsule)) = 
call_arrow_c_array_method_if_exists(value)? {
+            let schema_ptr =
+                extract_capsule(&schema_capsule, c"arrow_schema", 
"__arrow_c_array__")?;
+            let array_ptr = extract_capsule(&array_capsule, c"arrow_array", 
"__arrow_c_array__")?;
+            let array = unsafe { FFI_ArrowArray::from_raw(array_ptr.as_ptr()) 
};
             return unsafe { ffi::from_ffi(array, schema_ptr.as_ref()) 
}.map_err(to_py_err);
         }
 
@@ -344,25 +288,18 @@ impl FromPyArrow for RecordBatch {
         // method, so prefer it over _export_to_c.
         // See 
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
 
-        if value.hasattr("__arrow_c_array__")? {
-            let (schema_capsule, array_capsule) = 
extract_arrow_c_array_capsules(value)?;
-
-            validate_pycapsule(&schema_capsule, "arrow_schema")?;
-            validate_pycapsule(&array_capsule, "arrow_array")?;
-
-            let schema_ptr = schema_capsule
-                .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
-                .cast::<FFI_ArrowSchema>();
-            let array_ptr = array_capsule
-                .pointer_checked(Some(ARROW_ARRAY_CAPSULE_NAME))?
-                .cast::<FFI_ArrowArray>();
+        if let Some((schema_capsule, array_capsule)) = 
call_arrow_c_array_method_if_exists(value)? {
+            let schema_ptr =
+                extract_capsule(&schema_capsule, c"arrow_schema", 
"__arrow_c_array__")?;
+            let array_ptr = extract_capsule(&array_capsule, c"arrow_array", 
"__arrow_c_array__")?;
             let ffi_array = unsafe { 
FFI_ArrowArray::from_raw(array_ptr.as_ptr()) };
             let array_data =
                 unsafe { ffi::from_ffi(ffi_array, schema_ptr.as_ref()) 
}.map_err(to_py_err)?;
             if !matches!(array_data.data_type(), DataType::Struct(_)) {
-                return Err(PyTypeError::new_err(
-                    "Expected Struct type from __arrow_c_array.",
-                ));
+                return Err(PyTypeError::new_err(format!(
+                    "Expected Struct type from __arrow_c_array__, found {}.",
+                    array_data.data_type()
+                )));
             }
             let options = 
RecordBatchOptions::default().with_row_count(Some(array_data.len()));
             let array = StructArray::from(array_data);
@@ -419,19 +356,10 @@ impl FromPyArrow for ArrowArrayStreamReader {
         // Newer versions of PyArrow as well as other libraries with Arrow 
data implement this
         // method, so prefer it over _export_to_c.
         // See 
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
-        if value.hasattr("__arrow_c_stream__")? {
-            let capsule = value.call_method0("__arrow_c_stream__")?.extract()?;
-
-            validate_pycapsule(&capsule, "arrow_array_stream")?;
-
-            let stream = unsafe {
-                FFI_ArrowArrayStream::from_raw(
-                    capsule
-                        
.pointer_checked(Some(ARROW_ARRAY_STREAM_CAPSULE_NAME))?
-                        .cast::<FFI_ArrowArrayStream>()
-                        .as_ptr(),
-                )
-            };
+        if let Some(capsule) = call_capsule_method_if_exists(value, 
"__arrow_c_stream__")? {
+            let stream_ptr =
+                extract_capsule(&capsule, c"arrow_array_stream", 
"__arrow_c_stream__")?;
+            let stream = unsafe { 
FFI_ArrowArrayStream::from_raw(stream_ptr.as_ptr()) };
 
             let stream_reader = ArrowArrayStreamReader::try_new(stream)
                 .map_err(|err| PyValueError::new_err(err.to_string()))?;
@@ -447,8 +375,7 @@ impl FromPyArrow for ArrowArrayStreamReader {
         // make the conversion through PyArrow's private API
         // this changes the pointer's memory and is thus unsafe.
         // In particular, `_export_to_c` can go out of bounds
-        let args = PyTuple::new(value.py(), [&raw mut stream as 
Py_uintptr_t])?;
-        value.call_method1("_export_to_c", args)?;
+        value.call_method1("_export_to_c", (&raw mut stream as 
Py_uintptr_t,))?;
 
         ArrowArrayStreamReader::try_new(stream)
             .map_err(|err| PyValueError::new_err(err.to_string()))
@@ -630,3 +557,62 @@ impl<T> From<T> for PyArrowType<T> {
         Self(s)
     }
 }
+
+fn call_capsule_method_if_exists<'py>(
+    object: &Bound<'py, PyAny>,
+    method_name: &'static str,
+) -> PyResult<Option<Bound<'py, PyCapsule>>> {
+    let Some(method) = object.getattr_opt(method_name)? else {
+        return Ok(None);
+    };
+    Ok(Some(method.call0()?.extract().map_err(
+        |e: CastError| {
+            wrapping_type_error(
+                object.py(),
+                e.into(),
+                format!("Expected {method_name} to return a capsule."),
+            )
+        },
+    )?))
+}
+
+fn call_arrow_c_array_method_if_exists<'py>(
+    object: &Bound<'py, PyAny>,
+) -> PyResult<Option<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)>> {
+    let Some(method) = object.getattr_opt("__arrow_c_array__")? else {
+        return Ok(None);
+    };
+    Ok(Some(method.call0()?.extract().map_err(|e| {
+        wrapping_type_error(
+            object.py(),
+            e,
+            "Expected __arrow_c_array__ to return a tuple of (schema, array) 
capsules.".into(),
+        )
+    })?))
+}
+
+fn extract_capsule<T>(
+    capsule: &Bound<PyCapsule>,
+    capsule_name: &CStr,
+    method_name: &'static str,
+) -> PyResult<NonNull<T>> {
+    Ok(capsule
+        .pointer_checked(Some(capsule_name))
+        .map_err(|e| {
+            wrapping_type_error(
+                capsule.py(),
+                e,
+                format!(
+                    "Expected {method_name} to return a {} capsule.",
+                    capsule_name.to_str().unwrap(),
+                ),
+            )
+        })?
+        .cast::<T>())
+}
+
+fn wrapping_type_error(py: Python<'_>, error: PyErr, message: String) -> PyErr 
{
+    let e = PyTypeError::new_err(message);
+    e.set_cause(py, Some(error));
+    e
+}

Reply via email to