This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 05507db907c [SPARK-38952][PYTHON] Implement `numeric_only` of
`GroupBy.first` and `GroupBy.last`
05507db907c is described below
commit 05507db907c4f7845e86bde6a39226e1697ea638
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Apr 22 11:46:39 2022 +0900
[SPARK-38952][PYTHON] Implement `numeric_only` of `GroupBy.first` and
`GroupBy.last`
### What changes were proposed in this pull request?
Implement `numeric_only` of `GroupBy.first/last`.
### Why are the changes needed?
To increase API coverage.
### Does this PR introduce _any_ user-facing change?
Yes. `numeric_only` of `GroupBy.first/last` is supported.
```py
>>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],
"C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})
>>> df
A B C D
0 1 True 3 a
1 2 False 4 b
2 1 False 3 b
3 2 True 4 a
>>> df.groupby("A").first(numeric_only=True)
B C
A
1 True 3
2 False 4
>>> df.groupby("A").last(numeric_only=True)
B C
A
1 False 3
2 True 4
```
### How was this patch tested?
Unit tests.
Closes #36266 from xinrong-databricks/num_only.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../pandas_on_spark/supported_pandas_api.rst | 4 +-
python/pyspark/pandas/groupby.py | 78 ++++++++++++++++++++--
python/pyspark/pandas/tests/test_groupby.py | 53 ++++++---------
3 files changed, 97 insertions(+), 38 deletions(-)
diff --git
a/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
b/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
index 8ddbbbac42e..06d044d0190 100644
--- a/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
+++ b/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
@@ -1570,7 +1570,7 @@ Supported GroupBy APIs
+-----------------------------------------+-------------+-----------------------------------------+
| :func:`GroupBy.filter` | Y |
|
+-----------------------------------------+-------------+-----------------------------------------+
-| :func:`GroupBy.first` | P | ``numeric_only``, ``min_count`` |
+| :func:`GroupBy.first` | P | ``min_count`` |
+-----------------------------------------+-------------+-----------------------------------------+
| :func:`GroupBy.get_group` | Y |
|
+-----------------------------------------+-------------+-----------------------------------------+
@@ -1586,7 +1586,7 @@ Supported GroupBy APIs
+-----------------------------------------+-------------+-----------------------------------------+
| indices | N |
|
+-----------------------------------------+-------------+-----------------------------------------+
-| :func:`GroupBy.last` | P | ``numeric_only``, ``min_count`` |
+| :func:`GroupBy.last` | P | ``min_count`` |
+-----------------------------------------+-------------+-----------------------------------------+
| mad | N |
|
+-----------------------------------------+-------------+-----------------------------------------+
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index f5003f8577c..15469e604d0 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -403,28 +403,98 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
return self._reduce_for_stat_function(F.count, only_numeric=False)
# TODO: We should fix See Also when Series implementation is finished.
- def first(self) -> FrameLike:
+ def first(self, numeric_only: Optional[bool] = False) -> FrameLike:
"""
Compute first of group values.
+ Parameters
+ ----------
+ numeric_only : bool, default False
+ Include only float, int, boolean columns. If None, will attempt to use
+ everything, then use only numeric data.
+
+ .. versionadded:: 3.4.0
+
See Also
--------
pyspark.pandas.Series.groupby
pyspark.pandas.DataFrame.groupby
+
+ Examples
+ --------
+ >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],
+ ... "C": [3, 3, 4, 4], "D": ["a", "b", "b", "a"]})
+ >>> df
+ A B C D
+ 0 1 True 3 a
+ 1 2 False 3 b
+ 2 1 False 4 b
+ 3 2 True 4 a
+
+ >>> df.groupby("A").first().sort_index()
+ B C D
+ A
+ 1 True 3 a
+ 2 False 3 b
+
+ Include only float, int, boolean columns when set numeric_only True.
+
+ >>> df.groupby("A").first(numeric_only=True).sort_index()
+ B C
+ A
+ 1 True 3
+ 2 False 3
"""
- return self._reduce_for_stat_function(F.first, only_numeric=False)
+ return self._reduce_for_stat_function(
+ F.first, only_numeric=numeric_only, bool_as_numeric=True
+ )
- def last(self) -> FrameLike:
+ def last(self, numeric_only: Optional[bool] = False) -> FrameLike:
"""
Compute last of group values.
+ Parameters
+ ----------
+ numeric_only : bool, default False
+ Include only float, int, boolean columns. If None, will attempt to use
+ everything, then use only numeric data.
+
+ .. versionadded:: 3.4.0
+
See Also
--------
pyspark.pandas.Series.groupby
pyspark.pandas.DataFrame.groupby
+
+ Examples
+ --------
+ >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],
+ ... "C": [3, 3, 4, 4], "D": ["a", "b", "b", "a"]})
+ >>> df
+ A B C D
+ 0 1 True 3 a
+ 1 2 False 3 b
+ 2 1 False 4 b
+ 3 2 True 4 a
+
+ >>> df.groupby("A").last().sort_index()
+ B C D
+ A
+ 1 False 4 b
+ 2 True 4 a
+
+ Include only float, int, boolean columns when set numeric_only True.
+
+ >>> df.groupby("A").last(numeric_only=True).sort_index()
+ B C
+ A
+ 1 False 4
+ 2 True 4
"""
return self._reduce_for_stat_function(
- lambda col: F.last(col, ignorenulls=True), only_numeric=False
+ lambda col: F.last(col, ignorenulls=True),
+ only_numeric=numeric_only,
+ bool_as_numeric=True,
)
def max(self, numeric_only: Optional[bool] = False) -> FrameLike:
diff --git a/python/pyspark/pandas/tests/test_groupby.py
b/python/pyspark/pandas/tests/test_groupby.py
index 32b1378a876..d43919728e1 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -1242,7 +1242,8 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
pdf.groupby([("x", "a"), ("x", "b")]).rank().sort_index(),
)
- def test_min(self):
+ # TODO: All statistical functions should leverage this utility
+ def _test_stat_func(self, func):
pdf = pd.DataFrame(
{
"A": [1, 2, 1, 2],
@@ -1256,39 +1257,27 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
(pdf.groupby("A"), psdf.groupby("A")),
(pdf.groupby("A")[["C"]], psdf.groupby("A")[["C"]]),
]:
- self.assert_eq(p_groupby_obj.min().sort_index(), ps_groupby_obj.min().sort_index())
- self.assert_eq(
- p_groupby_obj.min(numeric_only=None).sort_index(),
- ps_groupby_obj.min(numeric_only=None).sort_index(),
- )
- self.assert_eq(
- p_groupby_obj.min(numeric_only=True).sort_index(),
- ps_groupby_obj.min(numeric_only=True).sort_index(),
- )
+ self.assert_eq(func(p_groupby_obj).sort_index(), func(ps_groupby_obj).sort_index())
+
+ def test_min(self):
+ self._test_stat_func(lambda groupby_obj: groupby_obj.min())
+ self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=None))
+ self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=True))
def test_max(self):
- pdf = pd.DataFrame(
- {
- "A": [1, 2, 1, 2],
- "B": [3.1, 4.1, 4.1, 3.1],
- "C": ["a", "b", "b", "a"],
- "D": [True, False, False, True],
- }
- )
- psdf = ps.from_pandas(pdf)
- for p_groupby_obj, ps_groupby_obj in [
- (pdf.groupby("A"), psdf.groupby("A")),
- (pdf.groupby("A")[["C"]], psdf.groupby("A")[["C"]]),
- ]:
- self.assert_eq(p_groupby_obj.max().sort_index(), ps_groupby_obj.max().sort_index())
- self.assert_eq(
- p_groupby_obj.max(numeric_only=None).sort_index(),
- ps_groupby_obj.max(numeric_only=None).sort_index(),
- )
- self.assert_eq(
- p_groupby_obj.max(numeric_only=True).sort_index(),
- ps_groupby_obj.max(numeric_only=True).sort_index(),
- )
+ self._test_stat_func(lambda groupby_obj: groupby_obj.max())
+ self._test_stat_func(lambda groupby_obj: groupby_obj.max(numeric_only=None))
+ self._test_stat_func(lambda groupby_obj: groupby_obj.max(numeric_only=True))
+
+ def test_first(self):
+ self._test_stat_func(lambda groupby_obj: groupby_obj.first())
+ self._test_stat_func(lambda groupby_obj: groupby_obj.first(numeric_only=None))
+ self._test_stat_func(lambda groupby_obj: groupby_obj.first(numeric_only=True))
+
+ def test_last(self):
+ self._test_stat_func(lambda groupby_obj: groupby_obj.last())
+ self._test_stat_func(lambda groupby_obj: groupby_obj.last(numeric_only=None))
+ self._test_stat_func(lambda groupby_obj: groupby_obj.last(numeric_only=True))
def test_cumcount(self):
pdf = pd.DataFrame(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]