jiayuasu commented on code in PR #807: URL: https://github.com/apache/sedona-db/pull/807#discussion_r3193290203
########## python/sedonadb/src/expr.rs: ########## @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Python-facing expression wrapper. +//! +//! This module is the Rust half of `sedonadb.expr.Expr`. It exposes a thin +//! PyO3 class (`PyExpr`, exported to Python as `_lib.InternalExpr`) that owns +//! a single `datafusion_expr::Expr` and a small set of factory `#[pyfunction]`s +//! used by the Python wrapper to construct columns and literals. +//! +//! The high-level shape: +//! +//! - The Python side (`sedonadb.expr.Expr`) holds a handle to a `PyExpr` and +//! provides operator overloading, docstrings, and Pythonic ergonomics. +//! - This Rust side stays minimal: build the right `Expr` variant, do the +//! minimum amount of validation that benefits from native types (e.g. +//! inspecting Arrow extension metadata for `cast`), and bubble up errors +//! via `PySedonaError`. +//! +//! The design mirrors the equivalent layer in the R bindings +//! (`r/sedonadb/src/rust/src/expression.rs`), where the Python `Expr` plays the +//! role of `SedonaDBExpr` and `expr_col` / `expr_lit` mirror +//! `SedonaDBExprFactory::column` / `::literal`. + +use datafusion_common::Column; +use datafusion_expr::{expr::InList, Cast, Expr}; +use pyo3::prelude::*; + +use crate::error::PySedonaError; +use crate::import_from::{import_arrow_field, import_arrow_scalar}; + +/// PyO3 wrapper around a single DataFusion logical expression. +/// +/// `PyExpr` is exposed to Python as `_lib.InternalExpr` and is the value +/// type behind the user-facing `sedonadb.expr.Expr` Python class. It is +/// `Clone` so that PyO3 method signatures can take `Vec<PyExpr>` by value; +/// each clone is cheap because `Expr` itself owns its children behind `Box`, +/// and cloning copies only the small wrapper plus a few `Arc`s in the +/// underlying tree. +/// +/// The `inner` field is intentionally `pub` so that other PyO3 modules in +/// this crate (e.g. `dataframe.rs`) can take `Vec<PyExpr>` arguments and +/// move the inner `Expr` out without going through accessor methods. +#[pyclass(name = "InternalExpr")] +#[derive(Clone)] +pub struct PyExpr { + pub inner: Expr, +} + +impl PyExpr { + pub fn new(inner: Expr) -> Self { + Self { inner } + } +} + +#[pymethods] +impl PyExpr { + /// Pretty-print using DataFusion's `Display` impl (e.g. `"x AS y"`). + /// Used by Python's `repr()` / `str()` paths. + fn __repr__(&self) -> String { + format!("{}", self.inner) + } + + /// Debug-print the full Rust `Expr` tree. Useful for tests and + /// troubleshooting; not part of the user-facing surface. + fn debug_string(&self) -> String { + format!("{:?}", self.inner) + } + + /// Return the name of the `Expr` enum variant (e.g. `"Column"`, + /// `"Literal"`, `"BinaryExpr"`). This is a stable structural property + /// that tests can assert on without depending on Display formatting. + fn variant_name(&self) -> String { + self.inner.variant_name().to_string() + } + + /// Wrap this expression in `Expr::Alias { name }`. + /// + /// We use DataFusion's `alias_if_changed` helper so that aliasing an + /// expression to its existing name is a no-op. This avoids producing + /// e.g. `Expr::Alias("x", Expr::Column("x"))` which is redundant but + /// otherwise legal. + fn alias(&self, name: &str) -> Result<Self, PySedonaError> { + let inner = self.inner.clone().alias_if_changed(name.to_string())?; + Ok(Self { inner }) + } + + /// Wrap this expression in `Expr::Cast` to the storage type of the + /// provided Arrow field-like object. + /// + /// Steps: + /// + /// 1. Pull an `arrow_schema::Field` out of `target` via the Arrow + /// PyCapsule schema interface (any object exposing + /// `__arrow_c_schema__` works — `pyarrow.DataType`, + /// `pyarrow.Field`, etc.). + /// 2. Reject Arrow extension types up front. SedonaDB's spatial types + /// are extension types over WKB; users who reach for `cast` on + /// those almost certainly want a different operation, so we surface + /// a clear error rather than silently dropping the extension. Review Comment: Thanks for the heads-up. Leaving extension-type cast as a clear error in this PR. Happy to revisit with the DF53 optimizer rule once that's available — geometry/geography is the obvious payoff. ########## python/sedonadb/src/expr.rs: ########## @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Python-facing expression wrapper. +//! +//! This module is the Rust half of `sedonadb.expr.Expr`. It exposes a thin +//! PyO3 class (`PyExpr`, exported to Python as `_lib.InternalExpr`) that owns +//! a single `datafusion_expr::Expr` and a small set of factory `#[pyfunction]`s +//! used by the Python wrapper to construct columns and literals. +//! +//! The high-level shape: +//! +//! - The Python side (`sedonadb.expr.Expr`) holds a handle to a `PyExpr` and +//! provides operator overloading, docstrings, and Pythonic ergonomics. +//! - This Rust side stays minimal: build the right `Expr` variant, do the +//! minimum amount of validation that benefits from native types (e.g. +//! inspecting Arrow extension metadata for `cast`), and bubble up errors +//! via `PySedonaError`. +//! +//! The design mirrors the equivalent layer in the R bindings +//! (`r/sedonadb/src/rust/src/expression.rs`), where the Python `Expr` plays the +//! role of `SedonaDBExpr` and `expr_col` / `expr_lit` mirror +//! `SedonaDBExprFactory::column` / `::literal`. + +use datafusion_common::Column; +use datafusion_expr::{expr::InList, Cast, Expr}; +use pyo3::prelude::*; + +use crate::error::PySedonaError; +use crate::import_from::{import_arrow_field, import_arrow_scalar}; + +/// PyO3 wrapper around a single DataFusion logical expression. +/// +/// `PyExpr` is exposed to Python as `_lib.InternalExpr` and is the value +/// type behind the user-facing `sedonadb.expr.Expr` Python class. It is +/// `Clone` so that PyO3 method signatures can take `Vec<PyExpr>` by value; +/// each clone is cheap because `Expr` itself owns its children behind `Box`, +/// and cloning copies only the small wrapper plus a few `Arc`s in the +/// underlying tree. +/// +/// The `inner` field is intentionally `pub` so that other PyO3 modules in +/// this crate (e.g. `dataframe.rs`) can take `Vec<PyExpr>` arguments and +/// move the inner `Expr` out without going through accessor methods. +#[pyclass(name = "InternalExpr")] +#[derive(Clone)] +pub struct PyExpr { + pub inner: Expr, +} + +impl PyExpr { + pub fn new(inner: Expr) -> Self { + Self { inner } + } +} + +#[pymethods] +impl PyExpr { + /// Pretty-print using DataFusion's `Display` impl (e.g. `"x AS y"`). + /// Used by Python's `repr()` / `str()` paths. + fn __repr__(&self) -> String { + format!("{}", self.inner) + } + + /// Debug-print the full Rust `Expr` tree. Useful for tests and + /// troubleshooting; not part of the user-facing surface. + fn debug_string(&self) -> String { + format!("{:?}", self.inner) + } + + /// Return the name of the `Expr` enum variant (e.g. `"Column"`, + /// `"Literal"`, `"BinaryExpr"`). This is a stable structural property + /// that tests can assert on without depending on Display formatting. + fn variant_name(&self) -> String { + self.inner.variant_name().to_string() + } + + /// Wrap this expression in `Expr::Alias { name }`. + /// + /// We use DataFusion's `alias_if_changed` helper so that aliasing an + /// expression to its existing name is a no-op. This avoids producing + /// e.g. `Expr::Alias("x", Expr::Column("x"))` which is redundant but + /// otherwise legal. + fn alias(&self, name: &str) -> Result<Self, PySedonaError> { + let inner = self.inner.clone().alias_if_changed(name.to_string())?; + Ok(Self { inner }) + } + + /// Wrap this expression in `Expr::Cast` to the storage type of the + /// provided Arrow field-like object. + /// + /// Steps: + /// + /// 1. Pull an `arrow_schema::Field` out of `target` via the Arrow + /// PyCapsule schema interface (any object exposing + /// `__arrow_c_schema__` works — `pyarrow.DataType`, + /// `pyarrow.Field`, etc.). + /// 2. Reject Arrow extension types up front. SedonaDB's spatial types + /// are extension types over WKB; users who reach for `cast` on + /// those almost certainly want a different operation, so we surface + /// a clear error rather than silently dropping the extension. + /// 3. Build `Expr::Cast { expr, data_type }` using only the storage + /// type. We don't carry field metadata through the cast. + fn cast(&self, target: Bound<'_, PyAny>) -> Result<Self, PySedonaError> { + let field = import_arrow_field(&target)?; + if let Some(type_name) = field.extension_type_name() { + return Err(PySedonaError::SedonaPython(format!( + "Can't cast to Arrow extension type '{type_name}'" + ))); + } + let inner = Expr::Cast(Cast::new( + Box::new(self.inner.clone()), + field.data_type().clone(), + )); + Ok(Self { inner }) + } + + /// Build `Expr::IsNull(self)`. SQL semantics — matches NULL only, + /// not floating-point NaN. The Pythonic NaN-aware accessor will live + /// on the future `Series` type. + fn is_null(&self) -> Self { + Self { + inner: Expr::IsNull(Box::new(self.inner.clone())), + } + } + + /// Build `Expr::IsNotNull(self)`. SQL semantics, mirror of `is_null`. + fn is_not_null(&self) -> Self { + Self { + inner: Expr::IsNotNull(Box::new(self.inner.clone())), + } + } + + /// Build `Expr::InList(self IN (values), negated)`. + /// + /// The Python side already coerces every element of the user's list + /// to a `PyExpr` (using `lit()` for non-Expr scalars), so the Rust + /// side just needs to clone each `Expr` out of its handle and hand + /// the resulting `Vec<Expr>` to `InList::new`. `negated=true` + /// produces `NOT IN`, exposed through the optional kwarg below. + #[pyo3(signature = (values, negated=false))] + fn isin(&self, values: Vec<PyRef<'_, PyExpr>>, negated: bool) -> Self { + let list = values.iter().map(|e| e.inner.clone()).collect(); + Self { + inner: Expr::InList(InList::new(Box::new(self.inner.clone()), list, negated)), + } + } + + /// Build `Expr::Negative(self)` — arithmetic negation, the unary `-`. + fn negate(&self) -> Self { + Self { + inner: Expr::Negative(Box::new(self.inner.clone())), + } + } +} + +/// Construct an unqualified column reference: `Expr::Column("x")`. +/// +/// Qualified columns (e.g. `t.x`) are not exposed yet; the Python +/// `col()` helper takes only a single name. When we add joins and +/// multi-table references we can grow this to accept an optional +/// table qualifier, matching the R side's `column(name, qualifier)`. +#[pyfunction] +pub fn expr_col(name: &str) -> PyExpr { Review Comment: Done in 0d1a4abe — `expr_col(name, qualifier=None)` on the Rust side and `col(name, qualifier=None)` on the Python side. Test in `test_col_with_qualifier`. ########## python/sedonadb/tests/expr/test_expression.py: ########## @@ -0,0 +1,147 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# These tests assert structural properties of constructed expressions — +# primarily `variant_name()`, child variants reachable via `debug_string()`, +# and the presence of user-supplied identifiers (column names, literal +# values) inside the rendered representation. Where possible we avoid +# pinning exact substrings of DataFusion's `Display` formatting so the +# suite is not coupled to a specific DataFusion version. + +import pyarrow as pa +import pytest + +from sedonadb.expr import Expr, col, lit + + +def test_col_returns_expr(): + e = col("x") + assert isinstance(e, Expr) + assert e._impl.variant_name() == "Column" + assert "x" in repr(e) Review Comment: Reverted in 0d1a4abe — exact repr substrings are back (`x AS y`, `CAST(x AS Int32)`, etc.), with `variant_name()` checks kept alongside as structural anchors. Added a module-level comment explaining the policy so the next person doesn't re-loosen them. ########## python/sedonadb/python/sedonadb/expr/expression.py: ########## @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Iterable + +from sedonadb._lib import expr_col as _expr_col +from sedonadb._lib import expr_lit as _expr_lit +from sedonadb.expr.literal import Literal + + +class Expr: + """A column expression. + + `Expr` represents a logical expression that will be evaluated against a + `DataFrame` when the frame is executed. Expressions are pure syntax — they + do not carry data and are not bound to a particular frame at construction + time. Errors such as referring to a column that does not exist surface only + when the expression is consumed (for example, by `DataFrame.select()` or + `DataFrame.filter()`). + + Construct an `Expr` with `col(name)` or `lit(value)`. + """ + + __slots__ = ("_impl",) + + def __init__(self, impl): + # impl is the underlying _lib.InternalExpr handle. Users normally + # do not construct Expr directly; use col() / lit() instead. + self._impl = impl + + def __repr__(self) -> str: + return f"Expr({self._impl!r})" + + def alias(self, name: str) -> "Expr": + """Return a copy of the expression with a new output name.""" + return Expr(self._impl.alias(name)) + + def cast(self, target) -> "Expr": + """Cast the expression to the given Arrow type. + + `target` must be an object exposing the Arrow C schema interface + (e.g. `pyarrow.int64()`, `pyarrow.string()`, a `pyarrow.Field`, or any + object with `__arrow_c_schema__`). Casting to Arrow extension types is + not supported. + """ + return Expr(self._impl.cast(target)) + + def is_null(self) -> "Expr": + """Return a boolean expression that is true where this expression + is SQL NULL. + + Note that floating-point NaN is *not* matched by `is_null` — the + SQL `IS NULL` predicate only matches NULL. A pandas-style + NaN-aware helper is planned on the future `Series` type. + """ + return Expr(self._impl.is_null()) + + def is_not_null(self) -> "Expr": + """Return a boolean expression that is true where this expression is + not null.""" + return Expr(self._impl.is_not_null()) + + def isin(self, values: Iterable[Any]) -> "Expr": + """Return a boolean expression that is true where this expression + equals any of the given values.""" + coerced = [_to_expr(v) for v in values] + return Expr(self._impl.isin([e._impl for e in coerced], False)) + + def negate(self) -> "Expr": + """Return the arithmetic negation of this expression.""" + return Expr(self._impl.negate()) + + +def col(name: str) -> Expr: + """Reference a column by name. + + Examples: + >>> from sedonadb.expr import col + >>> col("x").alias("y") + Expr(...) + """ + return Expr(_expr_col(name)) + + +def lit(value: Any) -> Expr: + """Wrap a Python value as a literal expression. + + Accepts the same value types as `sedonadb.expr.literal.lit`, including + Python scalars, pyarrow arrays/scalars, and Shapely geometries. Returns an + `Expr` suitable for composition with column expressions. + """ Review Comment: Got it, that makes sense — thanks for the context. Folded into 0d1a4abe: dropped the public `lit() -> Expr` and moved the Python-value-to-Expr coercion into the private `_to_expr()` helper. The existing `sedonadb.expr.literal.lit` returning `Literal` is untouched and stays re-exported from the package. ########## python/sedonadb/python/sedonadb/expr/expression.py: ########## @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Iterable + +from sedonadb._lib import expr_col as _expr_col +from sedonadb._lib import expr_lit as _expr_lit +from sedonadb.expr.literal import Literal + + +class Expr: + """A column expression. + + `Expr` represents a logical expression that will be evaluated against a + `DataFrame` when the frame is executed. Expressions are pure syntax — they + do not carry data and are not bound to a particular frame at construction + time. Errors such as referring to a column that does not exist surface only + when the expression is consumed (for example, by `DataFrame.select()` or + `DataFrame.filter()`). + + Construct an `Expr` with `col(name)` or `lit(value)`. + """ + + __slots__ = ("_impl",) + + def __init__(self, impl): + # impl is the underlying _lib.InternalExpr handle. Users normally + # do not construct Expr directly; use col() / lit() instead. + self._impl = impl Review Comment: Added in 0d1a4abe — `Expr.__init__` now `isinstance`-checks its argument against `_lib.InternalExpr` and raises `TypeError` with a message pointing at `col()`. Covered by `test_expr_init_rejects_wrong_type`. ########## python/sedonadb/python/sedonadb/expr/expression.py: ########## @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Iterable + +from sedonadb._lib import expr_col as _expr_col +from sedonadb._lib import expr_lit as _expr_lit +from sedonadb.expr.literal import Literal + + +class Expr: + """A column expression. + + `Expr` represents a logical expression that will be evaluated against a + `DataFrame` when the frame is executed. Expressions are pure syntax — they + do not carry data and are not bound to a particular frame at construction + time. Errors such as referring to a column that does not exist surface only + when the expression is consumed (for example, by `DataFrame.select()` or + `DataFrame.filter()`). + + Construct an `Expr` with `col(name)` or `lit(value)`. + """ + + __slots__ = ("_impl",) + + def __init__(self, impl): + # impl is the underlying _lib.InternalExpr handle. Users normally + # do not construct Expr directly; use col() / lit() instead. + self._impl = impl + + def __repr__(self) -> str: + return f"Expr({self._impl!r})" + + def alias(self, name: str) -> "Expr": + """Return a copy of the expression with a new output name.""" + return Expr(self._impl.alias(name)) Review Comment: Beefed up in 0d1a4abe. Public `Expr` methods and `col()` now have `Args:` and `Examples:` docstring blocks matching the style of `dataframe.py:head/limit/count`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
