This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new a960e71905e  [SPARK-44841][PS] Support `value_counts` for pandas 2.0.0 and above
a960e71905e is described below

commit a960e71905e35aee4b7152baec587b23c9183694
Author: itholic <haejoon....@databricks.com>
AuthorDate: Thu Aug 17 16:50:38 2023 +0800

    [SPARK-44841][PS] Support `value_counts` for pandas 2.0.0 and above

    ### What changes were proposed in this pull request?

    This PR proposes to support `object.value_counts` for pandas 2.0.0 and above by matching its behavior. See https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count for more detail.

    ### Why are the changes needed?

    We should match the behavior of the latest pandas.

    ### Does this PR introduce _any_ user-facing change?

    Yes, the behavior now follows pandas 2.0.0 and above.

    ### How was this patch tested?

    Enabled the existing UT.

    Closes #42525 from itholic/pandas_value_counts.

    Authored-by: itholic <haejoon....@databricks.com>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/pandas/base.py                   | 49 ++++++++++++++++---------
 python/pyspark/pandas/tests/series/test_stat.py |  4 --
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index 0685af76987..1cb17de89e8 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -1317,26 +1317,29 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         >>> df = ps.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
         >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
+        x
         1.0    3
         0.0    2
-        Name: x, dtype: int64
+        Name: count, dtype: int64

         With `normalize` set to `True`, returns the relative frequency by
         dividing all values by the sum of values.

         >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
+        x
         1.0    0.6
         0.0    0.4
-        Name: x, dtype: float64
+        Name: proportion, dtype: float64

         **dropna**

         With `dropna` set to `False` we can also see NaN index values.

         >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
+        x
         1.0    3
         0.0    2
         NaN    1
-        Name: x, dtype: int64
+        Name: count, dtype: int64

         For Index

@@ -1349,7 +1352,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         2.0    1
         3.0    2
         4.0    1
-        dtype: int64
+        Name: count, dtype: int64

         **sort**

@@ -1360,7 +1363,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         2.0    1
         3.0    2
         4.0    1
-        dtype: int64
+        Name: count, dtype: int64

         **normalize**

@@ -1372,7 +1375,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         2.0    0.2
         3.0    0.4
         4.0    0.2
-        dtype: float64
+        Name: proportion, dtype: float64

         **dropna**

@@ -1411,7 +1414,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         (falcon, length)    2
         (falcon, weight)    1
         (lama, weight)      3
-        dtype: int64
+        Name: count, dtype: int64

         >>> s.index.value_counts(normalize=True).sort_index()
         (cow, length)       0.111111
@@ -1419,31 +1422,37 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         (falcon, length)    0.222222
         (falcon, weight)    0.111111
         (lama, weight)      0.333333
-        dtype: float64
+        Name: proportion, dtype: float64

         If Index has name, keep the name up.
         >>> idx = ps.Index([0, 0, 0, 1, 1, 2, 3], name='pandas-on-Spark')
         >>> idx.value_counts().sort_index()
+        pandas-on-Spark
         0    3
         1    2
         2    1
         3    1
-        Name: pandas-on-Spark, dtype: int64
+        Name: count, dtype: int64
         """
-        from pyspark.pandas.series import first_series, Series
-
-        if isinstance(self, Series):
-            warnings.warn(
-                "The resulting Series will have a fixed name of 'count' from 4.0.0.",
-                FutureWarning,
-            )
+        from pyspark.pandas.series import first_series
+        from pyspark.pandas.indexes.multi import MultiIndex

         if bins is not None:
             raise NotImplementedError("value_counts currently does not support bins")

         if dropna:
-            sdf_dropna = self._internal.spark_frame.select(self.spark.column).dropna()
+            if isinstance(self, MultiIndex):
+                # If even one StructField is null, that row should be dropped.
+                index_spark_column_names = self._internal.index_spark_column_names
+                spark_column = self.spark.column
+                cond = F.lit(False)
+                for index_spark_column_name in index_spark_column_names:
+                    cond = cond | spark_column.getItem(index_spark_column_name).isNull()
+                sdf = self._internal.spark_frame.select(spark_column)
+                sdf_dropna = sdf.filter(~cond)
+            else:
+                sdf_dropna = self._internal.spark_frame.select(self.spark.column).dropna()
         else:
             sdf_dropna = self._internal.spark_frame.select(self.spark.column)
         index_name = SPARK_DEFAULT_INDEX_NAME
@@ -1456,13 +1465,17 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
             sdf = sdf.orderBy(F.col("count").desc())

         if normalize:
+            result_column_name = "proportion"
             drop_sum = sdf_dropna.count()
             sdf = sdf.withColumn("count", F.col("count") / F.lit(drop_sum))
+        else:
+            result_column_name = "count"

         internal = InternalFrame(
             spark_frame=sdf,
             index_spark_columns=[scol_for(sdf, index_name)],
-            column_labels=self._internal.column_labels,
+            index_names=self._internal.column_labels,
+            column_labels=[(result_column_name,)],
             data_spark_columns=[scol_for(sdf, "count")],
             column_label_names=self._internal.column_label_names,
         )

diff --git a/python/pyspark/pandas/tests/series/test_stat.py b/python/pyspark/pandas/tests/series/test_stat.py
index 048a4c94fd9..2c25e21954d 100644
--- a/python/pyspark/pandas/tests/series/test_stat.py
+++ b/python/pyspark/pandas/tests/series/test_stat.py
@@ -53,10 +53,6 @@ class SeriesStatMixin:
         self.assertEqual(ps.Series(range(100)).nunique(approx=True), 103)
         self.assertEqual(ps.Series(range(100)).nunique(approx=True, rsd=0.01), 100)

-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43464): Enable SeriesTests.test_value_counts for pandas 2.0.0.",
-    )
     def test_value_counts(self):
         # this is also containing test for Index & MultiIndex
         pser = pd.Series(
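For context, the naming change this commit matches can be reproduced with plain pandas. A minimal sketch, assuming pandas >= 2.0.0 is installed (pyspark.pandas produces the same output after this patch):

import numpy as np
import pandas as pd

s = pd.Series([0, 0, 1, 1, 1, np.nan], name="x")

# pandas >= 2.0.0 names the resulting Series "count" instead of
# reusing the input name "x"; the index keeps the original name.
print(s.value_counts())
# x
# 1.0    3
# 0.0    2
# Name: count, dtype: int64

# With normalize=True the fixed name is "proportion" instead.
print(s.value_counts(normalize=True))
# x
# 1.0    0.6
# 0.0    0.4
# Name: proportion, dtype: float64

This is what the doctest updates in base.py above encode: the index keeps its original name, while the resulting values are always named "count" (or "proportion" under normalize=True).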
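The new MultiIndex branch of the dropna logic is easier to follow in isolation. Below is a self-contained sketch of the same technique against a plain Spark DataFrame; the column name "idx" and struct field names "_1"/"_2" are hypothetical stand-ins for what the patch derives from the pandas-on-Spark internal frame:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# A MultiIndex is backed by a single struct column; here, two index
# tuples, one of which contains a null field.
df = spark.createDataFrame(
    [(("lama", "weight"),), ((None, "length"),)], ["idx"]
)

# Start from a never-true condition and OR in one null check per
# struct field, mirroring how the patch builds `cond`.
cond = F.lit(False)
for field_name in ["_1", "_2"]:
    cond = cond | F.col("idx").getItem(field_name).isNull()

# ~cond keeps only rows whose struct has no null field, so the
# (None, "length") row is dropped.
df.filter(~cond).show()

The explicit per-field check is needed because the plain .dropna() used in the non-MultiIndex branch only drops rows where the column value itself is null; it does not look inside struct fields, which is what the "If even one StructField is null" comment in the patch addresses.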