This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 548184079bc [SPARK-39186][PYTHON] Make pandas-on-Spark's skew
consistent with pandas
548184079bc is described below
commit 548184079bc0131f235fc65540911996f7aa6c86
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Sun May 15 09:30:55 2022 +0900
[SPARK-39186][PYTHON] Make pandas-on-Spark's skew consistent with pandas
The logic for computing skewness differs between Spark SQL and pandas:
Spark SQL: [`sqrt(n) * m3 / sqrt(m2 * m2 * m2)`](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L304)
pandas: [`(count * (count - 1) ** 0.5 / (count - 2)) * (m3 /
m2**1.5)`](https://github.com/pandas-dev/pandas/blob/main/pandas/core/nanops.py#L1221)
Why the change is needed: to make pandas-on-Spark's skew consistent with pandas.
User-facing change: yes, the logic used to compute skew was changed.
How it was tested: added unit tests.
Closes #36549 from zhengruifeng/adjust_pandas_skew.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit 7e4519c9a8ba35958ef6d408be3ca4e97917c965)
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit 386c75693b5b9dd5e3b2147d49f0284badaa7d6d)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/generic.py | 11 ++++++++++-
python/pyspark/pandas/tests/test_stats.py | 6 ++++++
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index 2fc79f6eb5f..ad3fcd74e57 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -1459,7 +1459,16 @@ class Frame(object, metaclass=ABCMeta):
spark_type_to_pandas_dtype(spark_type),
spark_type.simpleString()
)
)
- return F.skewness(spark_column)
+
+ count_scol = F.count(F.when(~spark_column.isNull(),
1).otherwise(None))
+ # refer to the Pandas implementation 'nanskew'
+ #
https://github.com/pandas-dev/pandas/blob/main/pandas/core/nanops.py#L1152
+ return F.when(
+ count_scol > 2,
+ F.skewness(spark_column)
+ * F.sqrt(1 - 1 / count_scol)
+ * (count_scol / (count_scol - 2)),
+ ).otherwise(None)
return self._reduce_for_stat_function(
skew, name="skew", axis=axis, numeric_only=numeric_only
diff --git a/python/pyspark/pandas/tests/test_stats.py
b/python/pyspark/pandas/tests/test_stats.py
index 21366954e32..667b43d0c96 100644
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/test_stats.py
@@ -183,6 +183,7 @@ class StatsTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1))
self.assert_eq(psdf.product(axis=1), pdf.product(axis=1))
self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
+ self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True)
self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1))
self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1))
self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1))
@@ -220,6 +221,11 @@ class StatsTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(
psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1,
numeric_only=True)
)
+ self.assert_eq(
+ psdf.skew(axis=0, numeric_only=True),
+ pdf.skew(axis=0, numeric_only=True),
+ almost=True,
+ )
self.assert_eq(
psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1,
numeric_only=True)
)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]