This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c6b09c0b71e7 [SPARK-49894][PYTHON][CONNECT] Refine the string
representation of column field operations
c6b09c0b71e7 is described below
commit c6b09c0b71e772de0605a555df9e78cbc4439ed6
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Oct 8 09:37:17 2024 +0900
[SPARK-49894][PYTHON][CONNECT] Refine the string representation of column
field operations
### What changes were proposed in this pull request?
Refine the string representation of column field operations: `GetField`,
`WithField`, and `DropFields`
### Why are the changes needed?
Make the string representations consistent between PySpark Classic and
Spark Connect.
### Does this PR introduce _any_ user-facing change?
Yes, the string representation of column field operations changes, as shown below.
before
```
In [1]: from pyspark.sql import functions as sf
In [2]: c = sf.col("c")
In [3]: c.x
Out[3]: Column<'UnresolvedExtractValue(c, x)'>
```
after
```
In [1]: from pyspark.sql import functions as sf
In [2]: c = sf.col("c")
In [3]: c.x
Out[3]: Column<'c['x']'>
```
### How was this patch tested?
Added a unit test (`test_col_field_ops_representation`) to `python/pyspark/sql/tests/test_column.py`.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #48369 from zhengruifeng/py_connect_col_str.
Lead-authored-by: Ruifeng Zheng <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/connect/expressions.py | 6 +--
python/pyspark/sql/tests/test_column.py | 71 +++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 3 deletions(-)
diff --git a/python/pyspark/sql/connect/expressions.py
b/python/pyspark/sql/connect/expressions.py
index 0b5512b61925..85f1b3565c69 100644
--- a/python/pyspark/sql/connect/expressions.py
+++ b/python/pyspark/sql/connect/expressions.py
@@ -809,7 +809,7 @@ class WithField(Expression):
return expr
def __repr__(self) -> str:
- return f"WithField({self._structExpr}, {self._fieldName},
{self._valueExpr})"
+ return f"update_field({self._structExpr}, {self._fieldName},
{self._valueExpr})"
class DropField(Expression):
@@ -833,7 +833,7 @@ class DropField(Expression):
return expr
def __repr__(self) -> str:
- return f"DropField({self._structExpr}, {self._fieldName})"
+ return f"drop_field({self._structExpr}, {self._fieldName})"
class UnresolvedExtractValue(Expression):
@@ -857,7 +857,7 @@ class UnresolvedExtractValue(Expression):
return expr
def __repr__(self) -> str:
- return f"UnresolvedExtractValue({str(self._child)},
{str(self._extraction)})"
+ return f"{self._child}['{self._extraction}']"
class UnresolvedRegex(Expression):
diff --git a/python/pyspark/sql/tests/test_column.py
b/python/pyspark/sql/tests/test_column.py
index 1972dd2804d9..5f1991973d27 100644
--- a/python/pyspark/sql/tests/test_column.py
+++ b/python/pyspark/sql/tests/test_column.py
@@ -283,6 +283,77 @@ class ColumnTestsMixin:
when_cond = sf.when(expression, sf.lit(None))
self.assertEqual(str(when_cond), "Column<'CASE WHEN foo THEN NULL
END'>")
+ def test_col_field_ops_representation(self):
+ # SPARK-49894: Test string representation of columns
+ c = sf.col("c")
+
+ # getField
+ self.assertEqual(str(c.x), "Column<'c['x']'>")
+ self.assertEqual(str(c.x.y), "Column<'c['x']['y']'>")
+ self.assertEqual(str(c.x.y.z), "Column<'c['x']['y']['z']'>")
+
+ self.assertEqual(str(c["x"]), "Column<'c['x']'>")
+ self.assertEqual(str(c["x"]["y"]), "Column<'c['x']['y']'>")
+ self.assertEqual(str(c["x"]["y"]["z"]), "Column<'c['x']['y']['z']'>")
+
+ self.assertEqual(str(c.getField("x")), "Column<'c['x']'>")
+ self.assertEqual(
+ str(c.getField("x").getField("y")),
+ "Column<'c['x']['y']'>",
+ )
+ self.assertEqual(
+ str(c.getField("x").getField("y").getField("z")),
+ "Column<'c['x']['y']['z']'>",
+ )
+
+ self.assertEqual(str(c.getItem("x")), "Column<'c['x']'>")
+ self.assertEqual(
+ str(c.getItem("x").getItem("y")),
+ "Column<'c['x']['y']'>",
+ )
+ self.assertEqual(
+ str(c.getItem("x").getItem("y").getItem("z")),
+ "Column<'c['x']['y']['z']'>",
+ )
+
+ self.assertEqual(
+ str(c.x["y"].getItem("z")),
+ "Column<'c['x']['y']['z']'>",
+ )
+ self.assertEqual(
+ str(c["x"].getField("y").getItem("z")),
+ "Column<'c['x']['y']['z']'>",
+ )
+ self.assertEqual(
+ str(c.getField("x").getItem("y").z),
+ "Column<'c['x']['y']['z']'>",
+ )
+ self.assertEqual(
+ str(c["x"].y.getField("z")),
+ "Column<'c['x']['y']['z']'>",
+ )
+
+ # WithField
+ self.assertEqual(
+ str(c.withField("x", sf.col("y"))),
+ "Column<'update_field(c, x, y)'>",
+ )
+ self.assertEqual(
+ str(c.withField("x", sf.col("y")).withField("x", sf.col("z"))),
+ "Column<'update_field(update_field(c, x, y), x, z)'>",
+ )
+
+ # DropFields
+ self.assertEqual(str(c.dropFields("x")), "Column<'drop_field(c, x)'>")
+ self.assertEqual(
+ str(c.dropFields("x", "y")),
+ "Column<'drop_field(drop_field(c, x), y)'>",
+ )
+ self.assertEqual(
+ str(c.dropFields("x", "y", "z")),
+ "Column<'drop_field(drop_field(drop_field(c, x), y), z)'>",
+ )
+
def test_lit_time_representation(self):
dt = datetime.date(2021, 3, 4)
self.assertEqual(str(sf.lit(dt)), "Column<'2021-03-04'>")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]