This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2f122ba6d13 [SPARK-38837][PYTHON] Implement `dropna` parameter of
`SeriesGroupBy.value_counts`
2f122ba6d13 is described below
commit 2f122ba6d13ea26411fa4bf3e636ced449a8a205
Author: Xinrong Meng <[email protected]>
AuthorDate: Mon Apr 11 13:38:04 2022 +0900
[SPARK-38837][PYTHON] Implement `dropna` parameter of
`SeriesGroupBy.value_counts`
### What changes were proposed in this pull request?
Implement the `dropna` parameter of `SeriesGroupBy.value_counts` to control
whether counts of NaN values are excluded from the result (excluded by default).
It also fixes the behavior of `self._dropna` in the context of
`SeriesGroupBy.value_counts`.
### Why are the changes needed?
To reach parity with pandas.
### Does this PR introduce _any_ user-facing change?
Yes. `dropna` parameter of `SeriesGroupBy.value_counts` is supported.
```py
>>> psdf = ps.DataFrame(
... {"A": [np.nan, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3,
np.nan]}, columns=["A", "B"]
... )
>>> psdf.groupby("A")["B"].value_counts(dropna=False).sort_index()
A B
2.0 1.0 1
2.0 1
3.0 3.0 2
NaN 1
Name: B, dtype: int64
>>> psdf.groupby("A",
dropna=False)["B"].value_counts(dropna=False).sort_index() # self.dropna=False
A B
2.0 1.0 1
2.0 1
3.0 3.0 2
NaN 1
NaN 1.0 1
Name: B, dtype: int64
>>> psdf.groupby("A")["B"].value_counts(dropna=True).sort_index()
A B
2.0 1.0 1
2.0 1
3.0 3.0 2
Name: B, dtype: int64
```
### How was this patch tested?
Unit tests.
Closes #36093 from xinrong-databricks/SeriesGroupBy.value_counts.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/groupby.py | 45 +++++++++++++++++++++--------
python/pyspark/pandas/tests/test_groupby.py | 38 ++++++++++++++++++++++--
2 files changed, 68 insertions(+), 15 deletions(-)
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index addb53d8cd5..6ef698015dd 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -3205,23 +3205,35 @@ class SeriesGroupBy(GroupBy[Series]):
Examples
--------
>>> df = ps.DataFrame({'A': [1, 2, 2, 3, 3, 3],
- ... 'B': [1, 1, 2, 3, 3, 3]},
+ ... 'B': [1, 1, 2, 3, 3, np.nan]},
... columns=['A', 'B'])
>>> df
- A B
- 0 1 1
- 1 2 1
- 2 2 2
- 3 3 3
- 4 3 3
- 5 3 3
+ A B
+ 0 1 1.0
+ 1 2 1.0
+ 2 2 2.0
+ 3 3 3.0
+ 4 3 3.0
+ 5 3 NaN
>>> df.groupby('A')['B'].value_counts().sort_index() # doctest:
+NORMALIZE_WHITESPACE
A B
- 1 1 1
- 2 1 1
- 2 1
- 3 3 3
+ 1 1.0 1
+ 2 1.0 1
+ 2.0 1
+ 3 3.0 2
+ Name: B, dtype: int64
+
+     Include counts of NaN when dropna is False.
+
+ >>> df.groupby('A')['B'].value_counts(
+ ... dropna=False).sort_index() # doctest: +NORMALIZE_WHITESPACE
+ A B
+ 1 1.0 1
+ 2 1.0 1
+ 2.0 1
+ 3 3.0 2
+ NaN 1
Name: B, dtype: int64
"""
groupkeys = self._groupkeys + self._agg_columns
@@ -3229,9 +3241,18 @@ class SeriesGroupBy(GroupBy[Series]):
groupkey_cols = [s.spark.column.alias(name) for s, name in
zip(groupkeys, groupkey_names)]
sdf = self._psdf._internal.spark_frame
+
agg_column = self._agg_columns[0]._internal.data_spark_column_names[0]
sdf = sdf.groupby(*groupkey_cols).count().withColumnRenamed("count",
agg_column)
+ if self._dropna:
+ _groupkey_column_names = groupkey_names[: len(self._groupkeys)]
+ sdf = sdf.dropna(subset=_groupkey_column_names)
+
+ if dropna:
+ _agg_columns_names = groupkey_names[len(self._groupkeys) :]
+ sdf = sdf.dropna(subset=_agg_columns_names)
+
if sort:
if ascending:
sdf = sdf.orderBy(scol_for(sdf, agg_column).asc())
diff --git a/python/pyspark/pandas/tests/test_groupby.py
b/python/pyspark/pandas/tests/test_groupby.py
index ec17e0dba27..8beedcabf54 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -1054,24 +1054,56 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
self.assertTrue(sorted(act) == sorted(exp))
def test_value_counts(self):
- pdf = pd.DataFrame({"A": [1, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, 3]},
columns=["A", "B"])
+ pdf = pd.DataFrame(
+ {"A": [np.nan, 2, 2, 3, 3, 3], "B": [1, 1, 2, 3, 3, np.nan]},
columns=["A", "B"]
+ )
psdf = ps.from_pandas(pdf)
self.assert_eq(
psdf.groupby("A")["B"].value_counts().sort_index(),
pdf.groupby("A")["B"].value_counts().sort_index(),
)
+ self.assert_eq(
+ psdf.groupby("A")["B"].value_counts(dropna=False).sort_index(),
+ pdf.groupby("A")["B"].value_counts(dropna=False).sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A",
dropna=False)["B"].value_counts(dropna=False).sort_index(),
+ pdf.groupby("A",
dropna=False)["B"].value_counts(dropna=False).sort_index(),
+ # Returns are the same considering values and types,
+ # disable check_exact to pass the assert_eq
+ check_exact=False,
+ )
self.assert_eq(
psdf.groupby("A")["B"].value_counts(sort=True,
ascending=False).sort_index(),
pdf.groupby("A")["B"].value_counts(sort=True,
ascending=False).sort_index(),
)
self.assert_eq(
- psdf.groupby("A")["B"].value_counts(sort=True,
ascending=True).sort_index(),
- pdf.groupby("A")["B"].value_counts(sort=True,
ascending=True).sort_index(),
+ psdf.groupby("A")["B"]
+ .value_counts(sort=True, ascending=False, dropna=False)
+ .sort_index(),
+ pdf.groupby("A")["B"]
+ .value_counts(sort=True, ascending=False, dropna=False)
+ .sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("A")["B"]
+ .value_counts(sort=True, ascending=True, dropna=False)
+ .sort_index(),
+ pdf.groupby("A")["B"]
+ .value_counts(sort=True, ascending=True, dropna=False)
+ .sort_index(),
)
self.assert_eq(
psdf.B.rename().groupby(psdf.A).value_counts().sort_index(),
pdf.B.rename().groupby(pdf.A).value_counts().sort_index(),
)
+ self.assert_eq(
+ psdf.B.rename().groupby(psdf.A,
dropna=False).value_counts().sort_index(),
+ pdf.B.rename().groupby(pdf.A,
dropna=False).value_counts().sort_index(),
+ # Returns are the same considering values and types,
+ # disable check_exact to pass the assert_eq
+ check_exact=False,
+ )
self.assert_eq(
psdf.B.groupby(psdf.A.rename()).value_counts().sort_index(),
pdf.B.groupby(pdf.A.rename()).value_counts().sort_index(),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]