This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 45f008a1ee pyarrow: Wrap capsule extraction errors into an explicit
error (#9660)
45f008a1ee is described below
commit 45f008a1eee7b475d0ae8757241d43bc1389c252
Author: Thomas Tanon <[email protected]>
AuthorDate: Mon Jun 22 20:14:13 2026 +0200
pyarrow: Wrap capsule extraction errors into an explicit error (#9660)
Remove validate_pycapsule: pointer_checked already raises an exception
if the capsule type is wrong, and we wrap it into a nicer error
Also relies on a trait to associate types with their capsules names
Because we wrap the exception (equivalent of `raise X from Y`), the
original exception is still there
Follow up of #9639 #9594
---
arrow-pyarrow-testing/Cargo.toml | 1 +
arrow-pyarrow-testing/tests/pyarrow.rs | 29 +++++
arrow-pyarrow/src/lib.rs | 210 +++++++++++++++------------------
3 files changed, 128 insertions(+), 112 deletions(-)
diff --git a/arrow-pyarrow-testing/Cargo.toml b/arrow-pyarrow-testing/Cargo.toml
index d5542e25d6..c5a15b5cc9 100644
--- a/arrow-pyarrow-testing/Cargo.toml
+++ b/arrow-pyarrow-testing/Cargo.toml
@@ -47,5 +47,6 @@ publish = false
[dependencies]
# Note no dependency on arrow, to ensure arrow-pyarrow can be used by itself
arrow-array = { path = "../arrow-array" }
+arrow-schema = { path = "../arrow-schema" }
arrow-pyarrow = { path = "../arrow-pyarrow" }
pyo3 = { version = "0.29.0", default-features = false }
diff --git a/arrow-pyarrow-testing/tests/pyarrow.rs
b/arrow-pyarrow-testing/tests/pyarrow.rs
index c70536b1c3..1e4ad43817 100644
--- a/arrow-pyarrow-testing/tests/pyarrow.rs
+++ b/arrow-pyarrow-testing/tests/pyarrow.rs
@@ -41,6 +41,7 @@ use arrow_array::builder::{BinaryViewBuilder,
StringViewBuilder};
use arrow_array::{
Array, ArrayRef, BinaryViewArray, Int32Array, RecordBatch, StringArray,
StringViewArray,
};
+use arrow_schema::Schema;
use arrow_pyarrow::{FromPyArrow, ToPyArrow};
use pyo3::exceptions::PyTypeError;
use pyo3::types::{PyAnyMethods, PyModule};
@@ -141,6 +142,34 @@ value = NotATuple()
});
}
+#[test]
+fn test_from_pyarrow_non_capsule() {
+ Python::initialize();
+
+ Python::attach(|py| {
+ let code = CString::new(
+ r#"
+class NotACapsule:
+ def __arrow_c_schema__(self):
+ return 1
+
+value = NotACapsule()
+"#,
+ )
+ .unwrap();
+
+ let module = PyModule::from_code(py, code.as_c_str(), c"test.py",
c"test_module").unwrap();
+ let value = module.getattr("value").unwrap();
+
+ let err = Schema::from_pyarrow_bound(&value).unwrap_err();
+ assert!(err.is_instance_of::<PyTypeError>(py));
+ assert_eq!(
+ err.to_string(),
+ "TypeError: Expected __arrow_c_schema__ to return a capsule."
+ );
+ });
+}
+
fn binary_view_column(num_variadic_buffers: usize) -> BinaryViewArray {
let long_scalar = b"but soft what light through yonder window
breaks".as_slice();
let mut builder =
BinaryViewBuilder::new().with_fixed_block_size(long_scalar.len() as u32);
diff --git a/arrow-pyarrow/src/lib.rs b/arrow-pyarrow/src/lib.rs
index 07063b37cf..976d5ebb89 100644
--- a/arrow-pyarrow/src/lib.rs
+++ b/arrow-pyarrow/src/lib.rs
@@ -61,6 +61,7 @@
use std::convert::{From, TryFrom};
use std::ffi::CStr;
+use std::ptr::NonNull;
use std::sync::Arc;
use arrow_array::ffi;
@@ -74,19 +75,15 @@ use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef};
use pyo3::exceptions::{PyTypeError, PyValueError};
use pyo3::ffi::Py_uintptr_t;
-use pyo3::import_exception;
use pyo3::prelude::*;
use pyo3::sync::PyOnceLock;
-use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType};
+use pyo3::types::{PyCapsule, PyDict, PyList, PyType};
+use pyo3::{CastError, import_exception};
import_exception!(pyarrow, ArrowException);
/// Represents an exception raised by PyArrow.
pub type PyArrowException = ArrowException;
-const ARROW_ARRAY_STREAM_CAPSULE_NAME: &CStr = c"arrow_array_stream";
-const ARROW_SCHEMA_CAPSULE_NAME: &CStr = c"arrow_schema";
-const ARROW_ARRAY_CAPSULE_NAME: &CStr = c"arrow_array";
-
fn to_py_err(err: ArrowError) -> PyErr {
PyArrowException::new_err(err.to_string())
}
@@ -131,56 +128,17 @@ fn validate_class(expected: &Bound<PyType>, value:
&Bound<PyAny>) -> PyResult<()
Ok(())
}
-fn validate_pycapsule(capsule: &Bound<PyCapsule>, name: &str) -> PyResult<()> {
- let capsule_name = capsule.name()?;
- if capsule_name.is_none() {
- return Err(PyValueError::new_err(
- "Expected schema PyCapsule to have name set.",
- ));
- }
-
- let capsule_name = unsafe { capsule_name.unwrap().as_cstr() }
- .to_str()
- .map_err(|e| PyValueError::new_err(e.to_string()))?;
- if capsule_name != name {
- return Err(PyValueError::new_err(format!(
- "Expected name '{name}' in PyCapsule, instead got
'{capsule_name}'",
- )));
- }
-
- Ok(())
-}
-
-fn extract_arrow_c_array_capsules<'py>(
- value: &Bound<'py, PyAny>,
-) -> PyResult<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)> {
- let tuple = value.call_method0("__arrow_c_array__")?;
-
- if !tuple.is_instance_of::<PyTuple>() {
- return Err(PyTypeError::new_err(
- "Expected __arrow_c_array__ to return a tuple of (schema, array)
capsules.",
- ));
- }
-
- tuple.extract().map_err(|_| {
- PyTypeError::new_err(
- "Expected __arrow_c_array__ to return a tuple of (schema, array)
capsules.",
- )
- })
-}
-
impl FromPyArrow for DataType {
fn from_pyarrow_bound(value: &Bound<PyAny>) -> PyResult<Self> {
// Newer versions of PyArrow as well as other libraries with Arrow
data implement this
// method, so prefer it over _export_to_c.
// See
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
- if value.hasattr("__arrow_c_schema__")? {
- let capsule = value.call_method0("__arrow_c_schema__")?.extract()?;
- validate_pycapsule(&capsule, "arrow_schema")?;
-
- let schema_ptr = capsule
- .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
- .cast::<FFI_ArrowSchema>();
+ if let Some(capsule) = call_capsule_method_if_exists(value,
"__arrow_c_schema__")? {
+ let schema_ptr = extract_capsule::<FFI_ArrowSchema>(
+ &capsule,
+ c"arrow_schema",
+ "__arrow_c_schema__",
+ )?;
return unsafe { DataType::try_from(schema_ptr.as_ref())
}.map_err(to_py_err);
}
@@ -204,13 +162,12 @@ impl FromPyArrow for Field {
// Newer versions of PyArrow as well as other libraries with Arrow
data implement this
// method, so prefer it over _export_to_c.
// See
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
- if value.hasattr("__arrow_c_schema__")? {
- let capsule = value.call_method0("__arrow_c_schema__")?.extract()?;
- validate_pycapsule(&capsule, "arrow_schema")?;
-
- let schema_ptr = capsule
- .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
- .cast::<FFI_ArrowSchema>();
+ if let Some(capsule) = call_capsule_method_if_exists(value,
"__arrow_c_schema__")? {
+ let schema_ptr = extract_capsule::<FFI_ArrowSchema>(
+ &capsule,
+ c"arrow_schema",
+ "__arrow_c_schema__",
+ )?;
return unsafe { Field::try_from(schema_ptr.as_ref())
}.map_err(to_py_err);
}
@@ -234,13 +191,12 @@ impl FromPyArrow for Schema {
// Newer versions of PyArrow as well as other libraries with Arrow
data implement this
// method, so prefer it over _export_to_c.
// See
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
- if value.hasattr("__arrow_c_schema__")? {
- let capsule = value.call_method0("__arrow_c_schema__")?.extract()?;
- validate_pycapsule(&capsule, "arrow_schema")?;
-
- let schema_ptr = capsule
- .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
- .cast::<FFI_ArrowSchema>();
+ if let Some(capsule) = call_capsule_method_if_exists(value,
"__arrow_c_schema__")? {
+ let schema_ptr = extract_capsule::<FFI_ArrowSchema>(
+ &capsule,
+ c"arrow_schema",
+ "__arrow_c_schema__",
+ )?;
return unsafe { Schema::try_from(schema_ptr.as_ref())
}.map_err(to_py_err);
}
@@ -264,23 +220,11 @@ impl FromPyArrow for ArrayData {
// Newer versions of PyArrow as well as other libraries with Arrow
data implement this
// method, so prefer it over _export_to_c.
// See
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
- if value.hasattr("__arrow_c_array__")? {
- let (schema_capsule, array_capsule) =
extract_arrow_c_array_capsules(value)?;
-
- validate_pycapsule(&schema_capsule, "arrow_schema")?;
- validate_pycapsule(&array_capsule, "arrow_array")?;
-
- let schema_ptr = schema_capsule
- .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
- .cast::<FFI_ArrowSchema>();
- let array = unsafe {
- FFI_ArrowArray::from_raw(
- array_capsule
- .pointer_checked(Some(ARROW_ARRAY_CAPSULE_NAME))?
- .cast::<FFI_ArrowArray>()
- .as_ptr(),
- )
- };
+ if let Some((schema_capsule, array_capsule)) =
call_arrow_c_array_method_if_exists(value)? {
+ let schema_ptr =
+ extract_capsule(&schema_capsule, c"arrow_schema",
"__arrow_c_array__")?;
+ let array_ptr = extract_capsule(&array_capsule, c"arrow_array",
"__arrow_c_array__")?;
+ let array = unsafe { FFI_ArrowArray::from_raw(array_ptr.as_ptr())
};
return unsafe { ffi::from_ffi(array, schema_ptr.as_ref())
}.map_err(to_py_err);
}
@@ -344,25 +288,18 @@ impl FromPyArrow for RecordBatch {
// method, so prefer it over _export_to_c.
// See
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
- if value.hasattr("__arrow_c_array__")? {
- let (schema_capsule, array_capsule) =
extract_arrow_c_array_capsules(value)?;
-
- validate_pycapsule(&schema_capsule, "arrow_schema")?;
- validate_pycapsule(&array_capsule, "arrow_array")?;
-
- let schema_ptr = schema_capsule
- .pointer_checked(Some(ARROW_SCHEMA_CAPSULE_NAME))?
- .cast::<FFI_ArrowSchema>();
- let array_ptr = array_capsule
- .pointer_checked(Some(ARROW_ARRAY_CAPSULE_NAME))?
- .cast::<FFI_ArrowArray>();
+ if let Some((schema_capsule, array_capsule)) =
call_arrow_c_array_method_if_exists(value)? {
+ let schema_ptr =
+ extract_capsule(&schema_capsule, c"arrow_schema",
"__arrow_c_array__")?;
+ let array_ptr = extract_capsule(&array_capsule, c"arrow_array",
"__arrow_c_array__")?;
let ffi_array = unsafe {
FFI_ArrowArray::from_raw(array_ptr.as_ptr()) };
let array_data =
unsafe { ffi::from_ffi(ffi_array, schema_ptr.as_ref())
}.map_err(to_py_err)?;
if !matches!(array_data.data_type(), DataType::Struct(_)) {
- return Err(PyTypeError::new_err(
- "Expected Struct type from __arrow_c_array.",
- ));
+ return Err(PyTypeError::new_err(format!(
+ "Expected Struct type from __arrow_c_array__, found {}.",
+ array_data.data_type()
+ )));
}
let options =
RecordBatchOptions::default().with_row_count(Some(array_data.len()));
let array = StructArray::from(array_data);
@@ -419,19 +356,10 @@ impl FromPyArrow for ArrowArrayStreamReader {
// Newer versions of PyArrow as well as other libraries with Arrow
data implement this
// method, so prefer it over _export_to_c.
// See
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
- if value.hasattr("__arrow_c_stream__")? {
- let capsule = value.call_method0("__arrow_c_stream__")?.extract()?;
-
- validate_pycapsule(&capsule, "arrow_array_stream")?;
-
- let stream = unsafe {
- FFI_ArrowArrayStream::from_raw(
- capsule
-
.pointer_checked(Some(ARROW_ARRAY_STREAM_CAPSULE_NAME))?
- .cast::<FFI_ArrowArrayStream>()
- .as_ptr(),
- )
- };
+ if let Some(capsule) = call_capsule_method_if_exists(value,
"__arrow_c_stream__")? {
+ let stream_ptr =
+ extract_capsule(&capsule, c"arrow_array_stream",
"__arrow_c_stream__")?;
+ let stream = unsafe {
FFI_ArrowArrayStream::from_raw(stream_ptr.as_ptr()) };
let stream_reader = ArrowArrayStreamReader::try_new(stream)
.map_err(|err| PyValueError::new_err(err.to_string()))?;
@@ -447,8 +375,7 @@ impl FromPyArrow for ArrowArrayStreamReader {
// make the conversion through PyArrow's private API
// this changes the pointer's memory and is thus unsafe.
// In particular, `_export_to_c` can go out of bounds
- let args = PyTuple::new(value.py(), [&raw mut stream as
Py_uintptr_t])?;
- value.call_method1("_export_to_c", args)?;
+ value.call_method1("_export_to_c", (&raw mut stream as
Py_uintptr_t,))?;
ArrowArrayStreamReader::try_new(stream)
.map_err(|err| PyValueError::new_err(err.to_string()))
@@ -630,3 +557,62 @@ impl<T> From<T> for PyArrowType<T> {
Self(s)
}
}
+
+fn call_capsule_method_if_exists<'py>(
+ object: &Bound<'py, PyAny>,
+ method_name: &'static str,
+) -> PyResult<Option<Bound<'py, PyCapsule>>> {
+ let Some(method) = object.getattr_opt(method_name)? else {
+ return Ok(None);
+ };
+ Ok(Some(method.call0()?.extract().map_err(
+ |e: CastError| {
+ wrapping_type_error(
+ object.py(),
+ e.into(),
+ format!("Expected {method_name} to return a capsule."),
+ )
+ },
+ )?))
+}
+
+fn call_arrow_c_array_method_if_exists<'py>(
+ object: &Bound<'py, PyAny>,
+) -> PyResult<Option<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)>> {
+ let Some(method) = object.getattr_opt("__arrow_c_array__")? else {
+ return Ok(None);
+ };
+ Ok(Some(method.call0()?.extract().map_err(|e| {
+ wrapping_type_error(
+ object.py(),
+ e,
+ "Expected __arrow_c_array__ to return a tuple of (schema, array)
capsules.".into(),
+ )
+ })?))
+}
+
+fn extract_capsule<T>(
+ capsule: &Bound<PyCapsule>,
+ capsule_name: &CStr,
+ method_name: &'static str,
+) -> PyResult<NonNull<T>> {
+ Ok(capsule
+ .pointer_checked(Some(capsule_name))
+ .map_err(|e| {
+ wrapping_type_error(
+ capsule.py(),
+ e,
+ format!(
+ "Expected {method_name} to return a {} capsule.",
+ capsule_name.to_str().unwrap(),
+ ),
+ )
+ })?
+ .cast::<T>())
+}
+
+fn wrapping_type_error(py: Python<'_>, error: PyErr, message: String) -> PyErr
{
+ let e = PyTypeError::new_err(message);
+ e.set_cause(py, Some(error));
+ e
+}