This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 211ce40888d [SPARK-40561][PS] Implement `min_count` in `GroupBy.min`
211ce40888d is described below
commit 211ce40888dcaaa3c3ffbd316109e17d0caad4e3
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Sep 27 09:53:09 2022 +0800
[SPARK-40561][PS] Implement `min_count` in `GroupBy.min`
### What changes were proposed in this pull request?
Implement `min_count` in `GroupBy.min`
### Why are the changes needed?
for API coverage
### Does this PR introduce _any_ user-facing change?
yes, new parameter `min_count` supported
```
>>> df.groupby("D").min(min_count=3).sort_index()
A B C
D
a 1.0 False 3.0
b NaN None NaN
```
### How was this patch tested?
added UT and doctest
Closes #37998 from zhengruifeng/ps_groupby_min.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/groupby.py | 41 ++++++++++++++++++++++++++---
python/pyspark/pandas/tests/test_groupby.py | 2 ++
2 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 6d36cfecce6..7085d2ec059 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -643,16 +643,23 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
bool_to_numeric=True,
)
- def min(self, numeric_only: Optional[bool] = False) -> FrameLike:
+ def min(self, numeric_only: Optional[bool] = False, min_count: int = -1) -> FrameLike:
"""
Compute min of group values.
+ .. versionadded:: 3.3.0
+
Parameters
----------
numeric_only : bool, default False
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data.
+ .. versionadded:: 3.4.0
+ min_count : int, default -1
+ The required number of valid values to perform the operation. If fewer
+ than min_count non-NA values are present the result will be NA.
+
.. versionadded:: 3.4.0
See Also
@@ -663,7 +670,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
Examples
--------
>>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False,
True],
- ... "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})
+ ... "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})
>>> df.groupby("A").min().sort_index()
B C D
A
@@ -677,9 +684,37 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
A
1 False 3
2 False 4
+
+ >>> df.groupby("D").min().sort_index()
+ A B C
+ D
+ a 1 False 3
+ b 1 False 3
+
+
+ >>> df.groupby("D").min(min_count=3).sort_index()
+ A B C
+ D
+ a 1.0 False 3.0
+ b NaN None NaN
"""
+ if not isinstance(min_count, int):
+ raise TypeError("min_count must be integer")
+
+ if min_count > 0:
+
+ def min(col: Column) -> Column:
+ return F.when(
+ F.count(F.when(~F.isnull(col), F.lit(0))) < min_count, F.lit(None)
+ ).otherwise(F.min(col))
+
+ else:
+
+ def min(col: Column) -> Column:
+ return F.min(col)
+
return self._reduce_for_stat_function(
- F.min, accepted_spark_types=(NumericType, BooleanType) if numeric_only else None
+ min, accepted_spark_types=(NumericType, BooleanType) if numeric_only else None
)
# TODO: sync the doc.
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 4a57a3421df..f0b3a04be17 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -1401,8 +1401,10 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
def test_min(self):
self._test_stat_func(lambda groupby_obj: groupby_obj.min())
+ self._test_stat_func(lambda groupby_obj: groupby_obj.min(min_count=2))
self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=None))
self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=True))
+ self._test_stat_func(lambda groupby_obj: groupby_obj.min(numeric_only=True, min_count=2))
def test_max(self):
self._test_stat_func(lambda groupby_obj: groupby_obj.max())
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]