This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8cbc741320d [SPARK-45267][PS] Change the default value for numeric_only
8cbc741320d is described below
commit 8cbc741320dac60ce814ce0a9b3e72239248efb8
Author: Haejoon Lee <[email protected]>
AuthorDate: Wed Sep 27 14:04:54 2023 +0800
[SPARK-45267][PS] Change the default value for numeric_only
### What changes were proposed in this pull request?
This PR proposes to change the default value of `numeric_only` in the
related functions.
### Why are the changes needed?
Many functions that support the `numeric_only` parameter changed their
default value from `True` to `False` in Pandas 2.0.0, so we should
follow that behavior. See
https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html for more detail.
### Does this PR introduce _any_ user-facing change?
Yes, the default value for `numeric_only` is changed to `False`.
### How was this patch tested?
Updated the related UTs.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43043 from itholic/numeric_only.
Authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/frame.py | 38 +++++++--------
python/pyspark/pandas/groupby.py | 54 +++++++---------------
python/pyspark/pandas/series.py | 13 ++++--
.../pandas/tests/computation/test_compute.py | 8 +++-
4 files changed, 47 insertions(+), 66 deletions(-)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 08450c0be87..faa595f80e3 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -747,7 +747,7 @@ class DataFrame(Frame, Generic[T]):
sfun: Callable[["Series"], PySparkColumn],
name: str,
axis: Optional[Axis] = None,
- numeric_only: bool = True,
+ numeric_only: bool = False,
skipna: bool = True,
**kwargs: Any,
) -> "Series":
@@ -762,10 +762,8 @@ class DataFrame(Frame, Generic[T]):
axis: used only for sanity check because the series only supports
index axis.
name : original pandas API name.
axis : axis to apply. 0 or 1, or 'index' or 'columns.
- numeric_only : bool, default True
- Include only float, int, boolean columns. False is not supported.
This parameter
- is mainly for pandas compatibility. Only 'DataFrame.count' uses
this parameter
- currently.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
skipna : bool, default True
Exclude NA/null values when computing the result.
"""
@@ -11150,7 +11148,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
# TODO: add axis, pct, na_option parameter
def rank(
- self, method: str = "average", ascending: bool = True, numeric_only:
Optional[bool] = None
+ self, method: str = "average", ascending: bool = True, numeric_only:
bool = False
) -> "DataFrame":
"""
Compute numerical data ranks (1 through n) along axis. Equal values are
@@ -11171,9 +11169,13 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
* dense: like 'min', but rank always increases by 1 between groups
ascending : boolean, default True
False for ranks by high (1) to low (N)
- numeric_only : bool, optional
+ numeric_only : bool, default False
For DataFrame objects, rank only numeric columns if set to True.
+ .. versionchanged:: 4.0.0
+ The default value of ``numeric_only`` is now ``False``.
+
+
Returns
-------
ranks : same type as caller
@@ -11238,11 +11240,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
2 2.5
3 4.0
"""
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `None` in 4.0.0.",
- FutureWarning,
- )
if numeric_only:
numeric_col_names = []
for label in self._internal.column_labels:
@@ -12206,7 +12203,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
self,
q: Union[float, Iterable[float]] = 0.5,
axis: Axis = 0,
- numeric_only: bool = True,
+ numeric_only: bool = False,
accuracy: int = 10000,
) -> DataFrameOrSeries:
"""
@@ -12222,9 +12219,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
0 <= q <= 1, the quantile(s) to compute.
axis : int or str, default 0 or 'index'
Can only be set to 0 now.
- numeric_only : bool, default True
- If False, the quantile of datetime and time delta data will be
computed as well.
- Can only be set to True now.
+ numeric_only : bool, default False
+ Include only `float`, `int` or `boolean` data.
+
+ .. versionchanged:: 4.0.0
+ The default value of ``numeric_only`` is now ``False``.
+
accuracy : int, optional
Default accuracy of approximation. Larger value means better
accuracy.
The relative error can be deduced by 1.0 / accuracy.
@@ -12821,12 +12821,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
if numeric_only is None and axis == 0:
numeric_only = True
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `True` in 4.0.0.",
- FutureWarning,
- )
-
mode_scols: List[PySparkColumn] = []
mode_col_names: List[str] = []
mode_labels: List[Label] = []
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 7bd64376152..3d51fabd4b2 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -611,18 +611,17 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
min_count=min_count,
)
- def mean(self, numeric_only: Optional[bool] = True) -> FrameLike:
+ def mean(self, numeric_only: Optional[bool] = False) -> FrameLike:
"""
Compute mean of groups, excluding missing values.
Parameters
----------
- numeric_only : bool, default True
- Include only float, int, boolean columns. If None, will attempt to
use
- everything, then use only numeric data. False is not supported.
- This parameter is mainly for pandas compatibility.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
.. versionadded:: 3.4.0
+ .. versionchanged:: 4.0.0
Returns
-------
@@ -842,7 +841,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
bool_to_numeric=True,
)
- def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) ->
FrameLike:
+ def sum(self, numeric_only: bool = False, min_count: int = 0) -> FrameLike:
"""
Compute sum of group values
@@ -851,11 +850,10 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
Parameters
----------
numeric_only : bool, default False
- Include only float, int, boolean columns. If None, will attempt to
use
- everything, then use only numeric data.
- It takes no effect since only numeric columns can be support here.
+ Include only float, int, boolean columns.
.. versionadded:: 3.4.0
+ .. versionchanged:: 4.0.0
min_count : int, default 0
The required number of valid values to perform the operation.
If fewer than min_count non-NA values are present the result will
be NA.
@@ -897,11 +895,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
pyspark.pandas.Series.groupby
pyspark.pandas.DataFrame.groupby
"""
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `True` in 4.0.0.",
- FutureWarning,
- )
if numeric_only is not None and not isinstance(numeric_only, bool):
raise TypeError("numeric_only must be None or bool")
if not isinstance(min_count, int):
@@ -927,7 +920,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
)
# TODO: sync the doc.
- def var(self, ddof: int = 1, numeric_only: Optional[bool] = True) ->
FrameLike:
+ def var(self, ddof: int = 1, numeric_only: bool = False) -> FrameLike:
"""
Compute variance of groups, excluding missing values.
@@ -942,10 +935,8 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
.. versionchanged:: 3.4.0
Supported including arbitary integers.
- numeric_only : bool, default True
- Include only float, int, boolean columns. If None, will attempt
to use
- everything, then use only numeric data. False is not supported.
- This parameter is mainly for pandas compatibility.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
.. versionadded:: 4.0.0
@@ -1179,7 +1170,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
return self._prepare_return(DataFrame(internal),
agg_column_names=agg_column_names)
- def prod(self, numeric_only: Optional[bool] = True, min_count: int = 0) ->
FrameLike:
+ def prod(self, numeric_only: bool = False, min_count: int = 0) ->
FrameLike:
"""
Compute prod of groups.
@@ -1188,8 +1179,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
Parameters
----------
numeric_only : bool, default False
- Include only float, int, boolean columns. If None, will attempt to
use
- everything, then use only numeric data.
+ Include only float, int, boolean columns.
+
+ .. versionchanged:: 4.0.0
min_count : int, default 0
The required number of valid values to perform the operation.
@@ -1235,12 +1227,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
if not isinstance(min_count, int):
raise TypeError("min_count must be integer")
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `True` in 4.0.0.",
- FutureWarning,
- )
-
self._validate_agg_columns(numeric_only=numeric_only,
function_name="prod")
return self._reduce_for_stat_function(
@@ -3441,7 +3427,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
return self._handle_output(DataFrame(internal))
- def median(self, numeric_only: Optional[bool] = True, accuracy: int =
10000) -> FrameLike:
+ def median(self, numeric_only: bool = False, accuracy: int = 10000) ->
FrameLike:
"""
Compute median of groups, excluding missing values.
@@ -3454,10 +3440,10 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
Parameters
----------
numeric_only : bool, default False
- Include only float, int, boolean columns. If None, will attempt to
use
- everything, then use only numeric data.
+ Include only float, int, boolean columns.
.. versionadded:: 3.4.0
+ .. versionchanged:: 4.0.0
Returns
-------
@@ -3509,12 +3495,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
self._validate_agg_columns(numeric_only=numeric_only,
function_name="median")
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `True` in 4.0.0.",
- FutureWarning,
- )
-
def stat_function(col: Column) -> Column:
return F.percentile_approx(col, 0.5, accuracy)
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index f1b785e1b41..e96e5c3b3dc 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -4054,7 +4054,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
# TODO: add axis, pct, na_option parameter
def rank(
- self, method: str = "average", ascending: bool = True, numeric_only:
Optional[bool] = None
+ self, method: str = "average", ascending: bool = True, numeric_only:
bool = False
) -> "Series":
"""
Compute numerical data ranks (1 through n) along axis. Equal values are
@@ -4075,9 +4075,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
* dense: like 'min', but rank always increases by 1 between groups
ascending : boolean, default True
False for ranks by high (1) to low (N)
- numeric_only : bool, optional
- If set to True, rank numeric Series, or raise TypeError for
non-numeric Series.
- False is not supported. This parameter is mainly for pandas
compatibility.
+ numeric_only : bool, default False
+ For DataFrame objects, rank only numeric columns if set to True.
+
+ .. versionchanged:: 4.0.0
+ The default value of ``numeric_only`` is now ``False``.
+
Returns
-------
@@ -7033,7 +7036,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
sfun: Callable[["Series"], PySparkColumn],
name: str_type,
axis: Optional[Axis] = None,
- numeric_only: bool = True,
+ numeric_only: bool = False,
skipna: bool = True,
**kwargs: Any,
) -> Scalar:
diff --git a/python/pyspark/pandas/tests/computation/test_compute.py
b/python/pyspark/pandas/tests/computation/test_compute.py
index dc145601fca..7f17a3bc6f0 100644
--- a/python/pyspark/pandas/tests/computation/test_compute.py
+++ b/python/pyspark/pandas/tests/computation/test_compute.py
@@ -352,9 +352,10 @@ class FrameComputeMixin:
pdf = pd.DataFrame({"x": ["a", "b", "c"]})
psdf = ps.from_pandas(pdf)
- self.assert_eq(psdf.quantile(0.5), pdf.quantile(0.5,
numeric_only=True))
+ self.assert_eq(psdf.quantile(0.5, numeric_only=True),
pdf.quantile(0.5, numeric_only=True))
self.assert_eq(
- psdf.quantile([0.25, 0.5, 0.75]), pdf.quantile([0.25, 0.5, 0.75],
numeric_only=True)
+ psdf.quantile([0.25, 0.5, 0.75], numeric_only=True),
+ pdf.quantile([0.25, 0.5, 0.75], numeric_only=True),
)
with self.assertRaisesRegex(TypeError, "Could not convert object
\\(string\\) to numeric"):
@@ -432,6 +433,9 @@ class FrameComputeMixin:
class FrameComputeTests(FrameComputeMixin, ComparisonTestBase, SQLTestUtils):
+ def test_quantile(self):
+ super().test_quantile()
+
pass
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]