Re: [PR] Upgrade to DataFusion 53 [datafusion-python]

via GitHub Sun, 01 Mar 2026 10:00:11 -0800


nuno-faria commented on code in PR #1402:
URL: 
https://github.com/apache/datafusion-python/pull/1402#discussion_r2869546529



##########
examples/datafusion-ffi-example/src/aggregate_udf.rs:
##########
@@ -27,7 +27,12 @@ use datafusion_functions_aggregate::sum::Sum;
 use pyo3::types::PyCapsule;
 use pyo3::{Bound, PyResult, Python, pyclass, pymethods};
 
-#[pyclass(name = "MySumUDF", module = "datafusion_ffi_example", subclass)]
+#[pyclass(
+    from_py_object,

Review Comment:
   
https://pyo3.rs/v0.28.2/migration.html#deprecation-of-automatic-frompyobject-for-pyclass-types-which-implement-clone



##########
examples/datafusion-ffi-example/src/catalog_provider.rs:
##########
@@ -60,6 +60,7 @@ pub fn my_table() -> Arc<dyn TableProvider + 'static> {
 }
 
 #[pyclass(
+    skip_from_py_object,

Review Comment:
   These ones do not implement `Clone`.



##########
docs/source/user-guide/upgrade-guides.rst:
##########
@@ -18,6 +18,24 @@
 Upgrade Guides
 ==============
 
+DataFusion 53.0.0
+-----------------
+
+This version includes an upgraded version of `pyo3`, which changed the way to 
extract an FFI object.
+Example:
+
+Before:
+.. code-block:: rust
+        let codec = unsafe { capsule.reference::<FFI_LogicalExtensionCodec>() 
};
+
+Now:
+.. code-block:: rust
+        let data: NonNull<FFI_LogicalExtensionCodec> = capsule
+            
.pointer_checked(Some(c_str!("datafusion_logical_extension_codec")))?
+            .cast();
+        let codec = unsafe { data.as_ref() };

Review Comment:
   https://github.com/PyO3/pyo3/pull/5474



##########
python/tests/test_functions.py:
##########
@@ -964,15 +970,15 @@ def test_temporal_functions(df):
             datetime(2027, 6, 1, tzinfo=DEFAULT_TZ),
             datetime(2020, 7, 1, tzinfo=DEFAULT_TZ),
         ],
-        type=pa.timestamp("ns", tz=DEFAULT_TZ),
+        type=pa.timestamp("us", tz=DEFAULT_TZ),

Review Comment:
   Don't know exactly what caused this, but now it returns microseconds instead 
of nanoseconds.



##########
python/tests/test_expr.py:
##########
@@ -857,20 +857,20 @@ def test_alias_with_metadata(df):
         pytest.param(
             col("d").list_distinct(),
             pa.array(
-                [[-1, 0, 1], [5, 10, 15, 20], [], None], 
type=pa.list_(pa.int64())
+                [[-1, 1, 0], [5, 10, 15, 20], [], None], 
type=pa.list_(pa.int64())
             ),
             id="list_distinct",
         ),
         pytest.param(
             col("d").array_distinct(),
             pa.array(
-                [[-1, 0, 1], [5, 10, 15, 20], [], None], 
type=pa.list_(pa.int64())
+                [[-1, 1, 0], [5, 10, 15, 20], [], None], 
type=pa.list_(pa.int64())

Review Comment:
   Order has changed for `array_distinct/list_distinct`: 
https://github.com/apache/datafusion/pull/20364 



##########
python/tests/test_context.py:
##########
@@ -759,7 +759,7 @@ def read_csv_with_options_inner(
     group_dir.mkdir(exist_ok=True)
 
     csv_path = group_dir / "test.csv"
-    csv_path.write_text(csv_content)
+    csv_path.write_text(csv_content, newline="\n")

Review Comment:
   Not related to the upgrade, but fails on Windows systems.



##########
examples/datafusion-ffi-example/src/utils.rs:
##########
@@ -31,10 +33,13 @@ pub(crate) fn ffi_logical_codec_from_pycapsule(
         obj
     };
 
-    let capsule = capsule.downcast::<PyCapsule>()?;
+    let capsule = capsule.cast::<PyCapsule>()?;
     validate_pycapsule(capsule, "datafusion_logical_extension_codec")?;
 
-    let codec = unsafe { capsule.reference::<FFI_LogicalExtensionCodec>() };
+    let data: NonNull<FFI_LogicalExtensionCodec> = capsule
+        .pointer_checked(Some(c_str!("datafusion_logical_extension_codec")))?
+        .cast();
+    let codec = unsafe { data.as_ref() };

Review Comment:
   https://github.com/PyO3/pyo3/pull/5474



##########
python/tests/test_functions.py:
##########
@@ -726,7 +729,10 @@ def test_array_function_obj_tests(stmt, py_expr):
             pa.array(["He??o", "Wor?d", "!"]),
         ),
         (f.reverse(column("a")), pa.array(["olleH", "dlroW", "!"])),
-        (f.right(column("a"), literal(4)), pa.array(["ello", "orld", "!"])),
+        (
+            f.right(column("a"), literal(4)),

Review Comment:
   https://github.com/apache/datafusion/pull/20069



##########
python/tests/test_dataframe.py:
##########
@@ -295,10 +295,17 @@ def test_drop_quoted_columns():
     ctx = SessionContext()
     batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], 
names=["ID_For_Students"])
     df = ctx.create_dataframe([[batch]])
-
-    # Both should work
+    # here we must quote to match the original column name
     assert df.drop('"ID_For_Students"').schema().names == []
-    assert df.drop("ID_For_Students").schema().names == []
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"]
+    )
+    df = ctx.create_dataframe([[batch]])
+    # with a lower case column, both 'a' and '"a"' work
+    assert df.drop("a").schema().names == ["b"]
+    df = ctx.create_dataframe([[batch]])
+    assert df.drop('"a"').schema().names == ["b"]

Review Comment:
   https://github.com/apache/datafusion/pull/19549



##########
python/tests/test_aggregation.py:
##########
@@ -141,14 +141,14 @@ def test_aggregation_stats(df, agg_expr, calc_expected):
         ),
         (
             f.approx_percentile_cont_with_weight(column("b"), lit(0.6), 0.5),
-            pa.array([6], type=pa.float64()),
+            pa.array([4], type=pa.float64()),
             False,
         ),
         (
             f.approx_percentile_cont_with_weight(
                 column("b").sort(ascending=False, nulls_first=False), 
lit(0.6), 0.5
             ),
-            pa.array([6], type=pa.float64()),
+            pa.array([4], type=pa.float64()),

Review Comment:
   https://github.com/apache/datafusion/pull/19941



##########
examples/datafusion-ffi-example/src/utils.rs:
##########
@@ -31,10 +33,13 @@ pub(crate) fn ffi_logical_codec_from_pycapsule(
         obj
     };
 
-    let capsule = capsule.downcast::<PyCapsule>()?;
+    let capsule = capsule.cast::<PyCapsule>()?;

Review Comment:
   
https://pyo3.rs/main/migration#downcast-and-downcasterror-replaced-with-cast-and-casterror



##########
python/tests/test_functions.py:
##########
@@ -697,7 +697,10 @@ def test_array_function_obj_tests(stmt, py_expr):
             f.initcap(column("c")),
             pa.array(["Hello ", " World ", " !"], type=pa.string_view()),
         ),
-        (f.left(column("a"), literal(3)), pa.array(["Hel", "Wor", "!"])),
+        (
+            f.left(column("a"), literal(3)),
+            pa.array(["Hel", "Wor", "!"], type=pa.string_view()),

Review Comment:
   https://github.com/apache/datafusion/pull/19980



##########
src/dataset_exec.rs:
##########
@@ -71,7 +71,7 @@ pub(crate) struct DatasetExec {
     columns: Option<Vec<String>>,
     filter_expr: Option<Py<PyAny>>,
     projected_statistics: Statistics,
-    plan_properties: datafusion::physical_plan::PlanProperties,
+    plan_properties: Arc<PlanProperties>,

Review Comment:
   https://github.com/apache/datafusion/pull/19792



##########
python/datafusion/dataframe.py:
##########
@@ -441,30 +441,20 @@ def select(self, *exprs: Expr | str) -> DataFrame:
     def drop(self, *columns: str) -> DataFrame:
         """Drop arbitrary amount of columns.
 
-        Column names are case-sensitive and do not require double quotes like
-        other operations such as `select`. Leading and trailing double quotes
-        are allowed and will be automatically stripped if present.
+        Column names are case-sensitive and require double quotes to be dropped
+        if the original name is not strictly lower case.
 
         Args:
-            columns: Column names to drop from the dataframe. Both 
``column_name``
-                    and ``"column_name"`` are accepted.
+            columns: Column names to drop from the dataframe.
 
         Returns:
             DataFrame with those columns removed in the projection.
 
         Example Usage::
-
-            df.drop('ID_For_Students')      # Works
-            df.drop('"ID_For_Students"')    # Also works (quotes stripped)
+            df.drop('a')                    # To drop a lower-cased column 'a'
+            df.drop('"a"')                  # To drop an upper-cased column 'A'
         """
-        normalized_columns = []
-        for col in columns:
-            if col.startswith('"') and col.endswith('"'):
-                normalized_columns.append(col.strip('"'))  # Strip double 
quotes
-            else:
-                normalized_columns.append(col)
-
-        return DataFrame(self.df.drop(*normalized_columns))
+        return DataFrame(self.df.drop(*columns))
 

Review Comment:
   The `drop` method used to remove the quotes from the names before passing to 
Rust. However, they are now explicitly needed if the column to remove contains 
uppercase letters.



##########
src/context.rs:
##########
@@ -815,7 +847,7 @@ impl PySessionContext {
             .to_str()
             .ok_or_else(|| PyValueError::new_err("Unable to convert path to a 
string"))?;
 
-        let mut options = NdJsonReadOptions::default()
+        let mut options = JsonReadOptions::default()

Review Comment:
   https://github.com/apache/datafusion/pull/19924



##########
src/expr/set_comparison.rs:
##########
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   New expression: https://github.com/apache/datafusion/pull/19109



##########
python/tests/test_expr.py:
##########
@@ -857,20 +857,20 @@ def test_alias_with_metadata(df):
         pytest.param(
             col("d").list_distinct(),
             pa.array(
-                [[-1, 0, 1], [5, 10, 15, 20], [], None], 
type=pa.list_(pa.int64())
+                [[-1, 1, 0], [5, 10, 15, 20], [], None], 
type=pa.list_(pa.int64())
             ),
             id="list_distinct",
         ),
         pytest.param(
             col("d").array_distinct(),
             pa.array(
-                [[-1, 0, 1], [5, 10, 15, 20], [], None], 
type=pa.list_(pa.int64())
+                [[-1, 1, 0], [5, 10, 15, 20], [], None], 
type=pa.list_(pa.int64())
             ),
             id="array_distinct",
         ),
         pytest.param(
             col("d").cardinality(),
-            pa.array([3, 4, None, None], type=pa.uint64()),
+            pa.array([3, 4, 0, None], type=pa.uint64()),

Review Comment:
   `cardinality` now returns 0 for empty arrays: 
https://github.com/apache/datafusion/pull/20533



##########
src/pyarrow_util.rs:
##########
@@ -142,9 +142,11 @@ impl FromPyArrow for PyScalarValue {
     }
 }
 
-impl<'source> FromPyObject<'source> for PyScalarValue {
-    fn extract_bound(value: &Bound<'source, PyAny>) -> PyResult<Self> {
-        Self::from_pyarrow_bound(value)
+impl<'source> FromPyObject<'_, 'source> for PyScalarValue {
+    type Error = PyErr;
+
+    fn extract(value: Borrowed<'_, 'source, PyAny>) -> Result<Self, 
Self::Error> {
+        Self::from_pyarrow_bound(&value)

Review Comment:
   
https://pyo3.rs/v0.28.2/migration.html#frompyobject-reworked-for-flexibility-and-efficiency



##########
src/sql/logical.rs:
##########
@@ -203,7 +210,7 @@ impl PyLogicalPlan {
         ctx: PySessionContext,
         proto_msg: Bound<'_, PyBytes>,
     ) -> PyDataFusionResult<Self> {
-        let bytes: &[u8] = proto_msg.extract()?;
+        let bytes: &[u8] = proto_msg.extract().map_err(Into::<PyErr>::into)?;

Review Comment:
   Need to convert the error now.



##########
src/dataset_exec.rs:
##########
@@ -235,11 +235,11 @@ impl ExecutionPlan for DatasetExec {
         })
     }
 
-    fn statistics(&self) -> DFResult<Statistics> {
+    fn partition_statistics(&self, _partition: Option<usize>) -> 
DFResult<Statistics> {

Review Comment:
   `statistics` has been removed: 
https://github.com/apache/datafusion/pull/20319



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Upgrade to DataFusion 53 [datafusion-python]

Reply via email to