This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d165de8c04c [SPARK-39246][PS] Implement Groupby.skew
d165de8c04c is described below
commit d165de8c04c41de5c67925cf670b2d7211c4da68
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Sun May 22 13:42:12 2022 +0800
[SPARK-39246][PS] Implement Groupby.skew
### What changes were proposed in this pull request?
Implement Groupby.skew
### Why are the changes needed?
for api coverage
### Does this PR introduce _any_ user-facing change?
yes, new api added
```
In [4]: df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True], "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})
In [5]: df.groupby("A").skew()
Out[5]:
B C
A
1 -1.732051 1.732051
2 NaN NaN
```
### How was this patch tested?
added UT
Closes #36624 from zhengruifeng/ps_groupby_skew_kurt.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/groupby.py | 34 +++++++++++++++++++++++++++++
python/pyspark/pandas/missing/groupby.py | 2 --
python/pyspark/pandas/tests/test_groupby.py | 2 +-
3 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index d1cff8e960d..03e6a038232 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -54,6 +54,7 @@ else:
_builtin_table = SelectionMixin._builtin_table  # type: ignore[attr-defined]
+from pyspark import SparkContext
from pyspark.sql import Column, DataFrame as SparkDataFrame, Window, functions as F
from pyspark.sql.types import (
BooleanType,
@@ -725,6 +726,39 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
bool_to_numeric=True,
)
+ def skew(self) -> FrameLike:
+ """
+ Compute skewness of groups, excluding missing values.
+
+ .. versionadded:: 3.4.0
+
+ Examples
+ --------
+ >>> df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True],
+ ... "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})
+
+ >>> df.groupby("A").skew()
+ B C
+ A
+ 1 -1.732051 1.732051
+ 2 NaN NaN
+
+ See Also
+ --------
+ pyspark.pandas.Series.groupby
+ pyspark.pandas.DataFrame.groupby
+ """
+
+ def skew(scol: Column) -> Column:
+ sql_utils = SparkContext._active_spark_context._jvm.PythonSQLUtils
+ return Column(sql_utils.pandasSkewness(scol._jc))
+
+ return self._reduce_for_stat_function(
+ skew,
+ accepted_spark_types=(NumericType,),
+ bool_to_numeric=True,
+ )
+
# TODO: skipna should be implemented.
def all(self) -> FrameLike:
"""
diff --git a/python/pyspark/pandas/missing/groupby.py b/python/pyspark/pandas/missing/groupby.py
index d0867e4982f..3ea443ebd6e 100644
--- a/python/pyspark/pandas/missing/groupby.py
+++ b/python/pyspark/pandas/missing/groupby.py
@@ -52,7 +52,6 @@ class MissingPandasLikeDataFrameGroupBy:
ngroups = _unsupported_property("ngroups")
plot = _unsupported_property("plot")
quantile = _unsupported_property("quantile")
- skew = _unsupported_property("skew")
tshift = _unsupported_property("tshift")
# Deprecated properties
@@ -87,7 +86,6 @@ class MissingPandasLikeSeriesGroupBy:
ngroups = _unsupported_property("ngroups")
plot = _unsupported_property("plot")
quantile = _unsupported_property("quantile")
- skew = _unsupported_property("skew")
tshift = _unsupported_property("tshift")
# Deprecated properties
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 1375d7a9bc0..045cbaf5274 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -266,7 +266,7 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils):
funcs = [
((True, False), ["sum", "min", "max", "count", "first", "last"]),
((True, True), ["mean"]),
- ((False, False), ["var", "std"]),
+ ((False, False), ["var", "std", "skew"]),
]
funcs = [(check_exact, almost, f) for (check_exact, almost), fs in funcs for f in fs]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]