This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 66f89f35f0e [SPARK-45090][PYTHON][CONNECT] `DataFrame.{cube, rollup}`
support column ordinals
66f89f35f0e is described below
commit 66f89f35f0eadbb97de3654286af4b76870a12b9
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Thu Sep 7 10:04:21 2023 +0800
[SPARK-45090][PYTHON][CONNECT] `DataFrame.{cube, rollup}` support column
ordinals
### What changes were proposed in this pull request?
`DataFrame.{cube, rollup}` support column ordinals
### Why are the changes needed?
for feature parity:
```
In [10]: df = spark.createDataFrame([(2, "Alice"), (5, "Bob")],
schema=["age", "name"])
In [11]: df.createOrReplaceTempView("v")
In [12]: spark.sql("SELECT name, age, COUNT(1) FROM v GROUP BY CUBE(1, 2)
ORDER BY 1, 2").show()
+-----+----+--------+
| name| age|count(1)|
+-----+----+--------+
| NULL|NULL| 2|
| NULL| 2| 1|
| NULL| 5| 1|
|Alice|NULL| 1|
|Alice| 2| 1|
| Bob|NULL| 1|
| Bob| 5| 1|
+-----+----+--------+
In [13]: df.select("name", "age").cube(1,
2).agg(sf.count(sf.lit(1))).orderBy(1, 2).show()
+-----+----+--------+
| name| age|count(1)|
+-----+----+--------+
| NULL|NULL| 2|
| NULL| 2| 1|
| NULL| 5| 1|
|Alice|NULL| 1|
|Alice| 2| 1|
| Bob|NULL| 1|
| Bob| 5| 1|
+-----+----+--------+
```
### Does this PR introduce _any_ user-facing change?
yes
### How was this patch tested?
added doctest
### Was this patch authored or co-authored using generative AI tooling?
NO
Closes #42832 from zhengruifeng/py_cube_rollup_ordinals.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/connect/dataframe.py | 12 +++++++++
python/pyspark/sql/dataframe.py | 46 ++++++++++++++++++++++++++++++---
2 files changed, 55 insertions(+), 3 deletions(-)
diff --git a/python/pyspark/sql/connect/dataframe.py
b/python/pyspark/sql/connect/dataframe.py
index c443023ce02..639a60a0fb3 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -512,6 +512,12 @@ class DataFrame:
_cols.append(c)
elif isinstance(c, str):
_cols.append(self[c])
+ elif isinstance(c, int) and not isinstance(c, bool):
+ # TODO: should introduce dedicated error class
+ if c < 1:
+ raise IndexError(f"Column ordinal must be positive but got
{c}")
+ # ordinal is 1-based
+ _cols.append(self[c - 1])
else:
raise PySparkTypeError(
error_class="NOT_COLUMN_OR_STR",
@@ -529,6 +535,12 @@ class DataFrame:
_cols.append(c)
elif isinstance(c, str):
_cols.append(self[c])
+ elif isinstance(c, int) and not isinstance(c, bool):
+ # TODO: should introduce dedicated error class
+ if c < 1:
+ raise IndexError(f"Column ordinal must be positive but got
{c}")
+ # ordinal is 1-based
+ _cols.append(self[c - 1])
else:
raise PySparkTypeError(
error_class="NOT_COLUMN_OR_STR",
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index f59ae40542b..f00c8c5ab42 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -3924,7 +3924,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
def rollup(self, __cols: Union[List[Column], List[str]]) -> "GroupedData":
...
- def rollup(self, *cols: "ColumnOrName") -> "GroupedData": # type:
ignore[misc]
+ def rollup(self, *cols: "ColumnOrNameOrOrdinal") -> "GroupedData": #
type: ignore[misc]
"""
Create a multi-dimensional rollup for the current :class:`DataFrame`
using
the specified columns, so we can run aggregation on them.
@@ -3934,6 +3934,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
.. versionchanged:: 3.4.0
Supports Spark Connect.
+ .. versionchanged:: 4.0.0
+ Supports column ordinal.
+
Parameters
----------
cols : list, str or :class:`Column`
@@ -3946,6 +3949,11 @@ class DataFrame(PandasMapOpsMixin,
PandasConversionMixin):
:class:`GroupedData`
Rolled-up data by given columns.
+ Notes
+ -----
+ A column ordinal starts from 1, which is different from the
+ 0-based :meth:`__getitem__`.
+
Examples
--------
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")],
schema=["age", "name"])
@@ -3959,8 +3967,19 @@ class DataFrame(PandasMapOpsMixin,
PandasConversionMixin):
| Bob|NULL| 1|
| Bob| 5| 1|
+-----+----+-----+
+
+ >>> df.rollup(2, 1).count().orderBy(1, 2).show()
+ +-----+----+-----+
+ | name| age|count|
+ +-----+----+-----+
+ | NULL|NULL| 2|
+ |Alice|NULL| 1|
+ |Alice| 2| 1|
+ | Bob|NULL| 1|
+ | Bob| 5| 1|
+ +-----+----+-----+
"""
- jgd = self._jdf.rollup(self._jcols(*cols))
+ jgd = self._jdf.rollup(self._jcols_ordinal(*cols))
from pyspark.sql.group import GroupedData
return GroupedData(jgd, self)
@@ -3983,6 +4002,9 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
.. versionchanged:: 3.4.0
Supports Spark Connect.
+ .. versionchanged:: 4.0.0
+ Supports column ordinal.
+
Parameters
----------
cols : list, str or :class:`Column`
@@ -3995,6 +4017,11 @@ class DataFrame(PandasMapOpsMixin,
PandasConversionMixin):
:class:`GroupedData`
Cube of the data by given columns.
+ Notes
+ -----
+ A column ordinal starts from 1, which is different from the
+ 0-based :meth:`__getitem__`.
+
Examples
--------
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")],
schema=["age", "name"])
@@ -4010,8 +4037,21 @@ class DataFrame(PandasMapOpsMixin,
PandasConversionMixin):
| Bob|NULL| 1|
| Bob| 5| 1|
+-----+----+-----+
+
+ >>> df.cube(2, 1).count().orderBy(1, 2).show()
+ +-----+----+-----+
+ | name| age|count|
+ +-----+----+-----+
+ | NULL|NULL| 2|
+ | NULL| 2| 1|
+ | NULL| 5| 1|
+ |Alice|NULL| 1|
+ |Alice| 2| 1|
+ | Bob|NULL| 1|
+ | Bob| 5| 1|
+ +-----+----+-----+
"""
- jgd = self._jdf.cube(self._jcols(*cols))
+ jgd = self._jdf.cube(self._jcols_ordinal(*cols))
from pyspark.sql.group import GroupedData
return GroupedData(jgd, self)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]