This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new a960e71905e  [SPARK-44841][PS] Support `value_counts` for pandas 2.0.0 and above
a960e71905e is described below

commit a960e71905e35aee4b7152baec587b23c9183694
Author: itholic <haejoon....@databricks.com>
AuthorDate: Thu Aug 17 16:50:38 2023 +0800

    [SPARK-44841][PS] Support `value_counts` for pandas 2.0.0 and above

    ### What changes were proposed in this pull request?

    This PR proposes to support `object.value_counts` for pandas 2.0.0 and above by matching its behavior. See https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#value-counts-sets-the-resulting-name-to-count for more detail.

    ### Why are the changes needed?

    We should match the behavior of the latest pandas.

    ### Does this PR introduce _any_ user-facing change?

    Yes, the behavior now follows pandas 2.0.0 and above.

    ### How was this patch tested?

    Enabled the existing UT.

    Closes #42525 from itholic/pandas_value_counts.

    Authored-by: itholic <haejoon....@databricks.com>
    Signed-off-by: Ruifeng Zheng <ruife...@apache.org>
---
 python/pyspark/pandas/base.py                   | 49 ++++++++++++++++---------
 python/pyspark/pandas/tests/series/test_stat.py |  4 --
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index 0685af76987..1cb17de89e8 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -1317,26 +1317,29 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         >>> df = ps.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
         >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
+        x
         1.0    3
         0.0    2
-        Name: x, dtype: int64
+        Name: count, dtype: int64

         With `normalize` set to `True`, returns the relative frequency by
         dividing all values by the sum of values.

         >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
+        x
         1.0    0.6
         0.0    0.4
-        Name: x, dtype: float64
+        Name: proportion, dtype: float64

         **dropna**

         With `dropna` set to `False` we can also see NaN index values.

         >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
+        x
         1.0    3
         0.0    2
         NaN    1
-        Name: x, dtype: int64
+        Name: count, dtype: int64

         For Index

@@ -1349,7 +1352,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         2.0    1
         3.0    2
         4.0    1
-        dtype: int64
+        Name: count, dtype: int64

         **sort**

@@ -1360,7 +1363,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         2.0    1
         3.0    2
         4.0    1
-        dtype: int64
+        Name: count, dtype: int64

         **normalize**

@@ -1372,7 +1375,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         2.0    0.2
         3.0    0.4
         4.0    0.2
-        dtype: float64
+        Name: proportion, dtype: float64

         **dropna**

@@ -1411,7 +1414,7 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         (falcon, length)    2
         (falcon, weight)    1
         (lama, weight)      3
-        dtype: int64
+        Name: count, dtype: int64

         >>> s.index.value_counts(normalize=True).sort_index()
         (cow, length)       0.111111
@@ -1419,31 +1422,37 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
         (falcon, length)    0.222222
         (falcon, weight)    0.111111
         (lama, weight)      0.333333
-        dtype: float64
+        Name: proportion, dtype: float64

         If Index has name, keep the name up.
         >>> idx = ps.Index([0, 0, 0, 1, 1, 2, 3], name='pandas-on-Spark')
         >>> idx.value_counts().sort_index()
+        pandas-on-Spark
         0    3
         1    2
         2    1
         3    1
-        Name: pandas-on-Spark, dtype: int64
+        Name: count, dtype: int64
         """
-        from pyspark.pandas.series import first_series, Series
-
-        if isinstance(self, Series):
-            warnings.warn(
-                "The resulting Series will have a fixed name of 'count' from 4.0.0.",
-                FutureWarning,
-            )
+        from pyspark.pandas.series import first_series
+        from pyspark.pandas.indexes.multi import MultiIndex

         if bins is not None:
             raise NotImplementedError("value_counts currently does not support bins")

         if dropna:
-            sdf_dropna = self._internal.spark_frame.select(self.spark.column).dropna()
+            if isinstance(self, MultiIndex):
+                # If even one StructField is null, that row should be dropped.
+                index_spark_column_names = self._internal.index_spark_column_names
+                spark_column = self.spark.column
+                cond = F.lit(False)
+                for index_spark_column_name in index_spark_column_names:
+                    cond = cond | spark_column.getItem(index_spark_column_name).isNull()
+                sdf = self._internal.spark_frame.select(spark_column)
+                sdf_dropna = sdf.filter(~cond)
+            else:
+                sdf_dropna = self._internal.spark_frame.select(self.spark.column).dropna()
         else:
             sdf_dropna = self._internal.spark_frame.select(self.spark.column)
         index_name = SPARK_DEFAULT_INDEX_NAME
@@ -1456,13 +1465,17 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
             sdf = sdf.orderBy(F.col("count").desc())

         if normalize:
+            result_column_name = "proportion"
             drop_sum = sdf_dropna.count()
             sdf = sdf.withColumn("count", F.col("count") / F.lit(drop_sum))
+        else:
+            result_column_name = "count"

         internal = InternalFrame(
             spark_frame=sdf,
             index_spark_columns=[scol_for(sdf, index_name)],
-            column_labels=self._internal.column_labels,
+            index_names=self._internal.column_labels,
+            column_labels=[(result_column_name,)],
             data_spark_columns=[scol_for(sdf, "count")],
             column_label_names=self._internal.column_label_names,
         )

diff --git a/python/pyspark/pandas/tests/series/test_stat.py b/python/pyspark/pandas/tests/series/test_stat.py
index 048a4c94fd9..2c25e21954d 100644
--- a/python/pyspark/pandas/tests/series/test_stat.py
+++ b/python/pyspark/pandas/tests/series/test_stat.py
@@ -53,10 +53,6 @@ class SeriesStatMixin:
         self.assertEqual(ps.Series(range(100)).nunique(approx=True), 103)
         self.assertEqual(ps.Series(range(100)).nunique(approx=True, rsd=0.01), 100)

-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43464): Enable SeriesTests.test_value_counts for pandas 2.0.0.",
-    )
     def test_value_counts(self):
         # this is also containing test for Index & MultiIndex
         pser = pd.Series(
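For context, the naming change this commit matches can be reproduced with plain pandas. A minimal sketch, assuming pandas >= 2.0.0 is installed (pyspark.pandas produces the same output after this patch):

import numpy as np
import pandas as pd

s = pd.Series([0, 0, 1, 1, 1, np.nan], name="x")

# pandas >= 2.0.0 names the resulting Series "count" instead of
# reusing the input name "x"; the index keeps the original name.
print(s.value_counts())
# x
# 1.0    3
# 0.0    2
# Name: count, dtype: int64

# With normalize=True the fixed name is "proportion" instead.
print(s.value_counts(normalize=True))
# x
# 1.0    0.6
# 0.0    0.4
# Name: proportion, dtype: float64

This is what the doctest updates in base.py above encode: the index keeps its original name, while the resulting values are always named "count" (or "proportion" under normalize=True).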
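The new MultiIndex branch of the dropna logic is easier to follow in isolation. Below is a self-contained sketch of the same technique against a plain Spark DataFrame; the column name "idx" and struct field names "_1"/"_2" are hypothetical stand-ins for what the patch derives from the pandas-on-Spark internal frame:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# A MultiIndex is backed by a single struct column; here, two index
# tuples, one of which contains a null field.
df = spark.createDataFrame(
    [(("lama", "weight"),), ((None, "length"),)], ["idx"]
)

# Start from a never-true condition and OR in one null check per
# struct field, mirroring how the patch builds `cond`.
cond = F.lit(False)
for field_name in ["_1", "_2"]:
    cond = cond | F.col("idx").getItem(field_name).isNull()

# ~cond keeps only rows whose struct has no null field, so the
# (None, "length") row is dropped.
df.filter(~cond).show()

The explicit per-field check is needed because the plain .dropna() used in the non-MultiIndex branch only drops rows where the column value itself is null; it does not look inside struct fields, which is what the "If even one StructField is null" comment in the patch addresses.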