This is an automated email from the ASF dual-hosted git repository.
timsaucer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new 694a5d8d feat: Add SQL expression support for `with_columns` (#1286)
694a5d8d is described below
commit 694a5d8d8d6a7e44b92cc45deedf8b162eb1366d
Author: Marko Milenković <[email protected]>
AuthorDate: Tue Oct 28 12:13:07 2025 +0000
feat: Add SQL expression support for `with_columns` (#1286)
* add SQL expression support for `with_columns`
* fix ruff errors
* Update python/datafusion/dataframe.py
Co-authored-by: Hendrik Makait <[email protected]>
* Update python/datafusion/dataframe.py
Co-authored-by: Hendrik Makait <[email protected]>
* remove parentheses
* update example
* fix ident
---------
Co-authored-by: Hendrik Makait <[email protected]>
---
python/datafusion/dataframe.py | 42 +++++++++++++++++++++++++++++++++++-------
python/tests/test_dataframe.py | 38 +++++++++++++++++++++++++++++---------
2 files changed, 64 insertions(+), 16 deletions(-)
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index 645598b5..eed30f57 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -545,13 +545,14 @@ class DataFrame:
return DataFrame(self.df.with_column(name, ensure_expr(expr)))
def with_columns(
- self, *exprs: Expr | Iterable[Expr], **named_exprs: Expr
+ self, *exprs: Expr | str | Iterable[Expr | str], **named_exprs: Expr | str
) -> DataFrame:
"""Add columns to the DataFrame.
- By passing expressions, iterables of expressions, or named expressions.
+ By passing expressions, iterables of expressions, string SQL expressions,
+ or named expressions.
All expressions must be :class:`~datafusion.expr.Expr` objects created via
- :func:`datafusion.col` or :func:`datafusion.lit`.
+ :func:`datafusion.col` or :func:`datafusion.lit`, or SQL expression strings.
To pass named expressions use the form ``name=Expr``.
Example usage: The following will add 4 columns labeled ``a``, ``b``, ``c``,
@@ -564,17 +565,44 @@ class DataFrame:
d=lit(3)
)
+ Equivalent example using just SQL strings:
+
+ df = df.with_columns(
+ "x as a",
+ ["1 as b", "y as c"],
+ d="3"
+ )
+
Args:
- exprs: Either a single expression or an iterable of expressions to add.
+ exprs: Either a single expression, an iterable of expressions to add or
+ SQL expression strings.
named_exprs: Named expressions in the form of ``name=expr``
Returns:
DataFrame with the new columns added.
"""
- expressions = ensure_expr_list(exprs)
+ expressions = []
+ for expr in exprs:
+ if isinstance(expr, str):
+ expressions.append(self.parse_sql_expr(expr).expr)
+ elif isinstance(expr, Iterable) and not isinstance(
+ expr, (Expr, str, bytes, bytearray)
+ ):
+ expressions.extend(
+ [
+ self.parse_sql_expr(e).expr
+ if isinstance(e, str)
+ else ensure_expr(e)
+ for e in expr
+ ]
+ )
+ else:
+ expressions.append(ensure_expr(expr))
+
for alias, expr in named_exprs.items():
- ensure_expr(expr)
- expressions.append(expr.alias(alias).expr)
+ e = self.parse_sql_expr(expr) if isinstance(expr, str) else expr
+ ensure_expr(e)
+ expressions.append(e.alias(alias).expr)
return DataFrame(self.df.with_columns(expressions))
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index b2333382..c3a5253c 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -538,15 +538,35 @@ def test_with_columns(df):
assert result.column(6) == pa.array([5, 7, 9])
-def test_with_columns_invalid_expr(df):
- with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
- df.with_columns("a")
- with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
- df.with_columns(c="a")
- with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
- df.with_columns(["a"])
- with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)):
- df.with_columns(c=["a"])
+def test_with_columns_str(df):
+ df = df.with_columns(
+ "a + b as c",
+ "a + b as d",
+ [
+ "a + b as e",
+ "a + b as f",
+ ],
+ g="a + b",
+ )
+
+ # execute and collect the first (and only) batch
+ result = df.collect()[0]
+
+ assert result.schema.field(0).name == "a"
+ assert result.schema.field(1).name == "b"
+ assert result.schema.field(2).name == "c"
+ assert result.schema.field(3).name == "d"
+ assert result.schema.field(4).name == "e"
+ assert result.schema.field(5).name == "f"
+ assert result.schema.field(6).name == "g"
+
+ assert result.column(0) == pa.array([1, 2, 3])
+ assert result.column(1) == pa.array([4, 5, 6])
+ assert result.column(2) == pa.array([5, 7, 9])
+ assert result.column(3) == pa.array([5, 7, 9])
+ assert result.column(4) == pa.array([5, 7, 9])
+ assert result.column(5) == pa.array([5, 7, 9])
+ assert result.column(6) == pa.array([5, 7, 9])
def test_cast(df):
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]