This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8cbc741320d [SPARK-45267][PS] Change the default value for numeric_only
8cbc741320d is described below
commit 8cbc741320dac60ce814ce0a9b3e72239248efb8
Author: Haejoon Lee <[email protected]>
AuthorDate: Wed Sep 27 14:04:54 2023 +0800
[SPARK-45267][PS] Change the default value for numeric_only
### What changes were proposed in this pull request?
This PR proposes to change the default value of `numeric_only` in the
related functions.
### Why are the changes needed?
Many functions that support the `numeric_only` parameter changed their
default value from `True` to `False` in Pandas 2.0.0, so we should
follow that behavior. See
https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html for more detail.
### Does this PR introduce _any_ user-facing change?
Yes, the default value for `numeric_only` is changed to `False`.
### How was this patch tested?
Updated the related UTs.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43043 from itholic/numeric_only.
Authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/frame.py | 38 +++++++--------
python/pyspark/pandas/groupby.py | 54 +++++++---------------
python/pyspark/pandas/series.py | 13 ++++--
.../pandas/tests/computation/test_compute.py | 8 +++-
4 files changed, 47 insertions(+), 66 deletions(-)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 08450c0be87..faa595f80e3 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -747,7 +747,7 @@ class DataFrame(Frame, Generic[T]):
sfun: Callable[["Series"], PySparkColumn],
name: str,
axis: Optional[Axis] = None,
- numeric_only: bool = True,
+ numeric_only: bool = False,
skipna: bool = True,
**kwargs: Any,
) -> "Series":
@@ -762,10 +762,8 @@ class DataFrame(Frame, Generic[T]):
axis: used only for sanity check because the series only supports
index axis.
name : original pandas API name.
axis : axis to apply. 0 or 1, or 'index' or 'columns.
- numeric_only : bool, default True
- Include only float, int, boolean columns. False is not supported.
This parameter
- is mainly for pandas compatibility. Only 'DataFrame.count' uses
this parameter
- currently.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
skipna : bool, default True
Exclude NA/null values when computing the result.
"""
@@ -11150,7 +11148,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
# TODO: add axis, pct, na_option parameter
def rank(
- self, method: str = "average", ascending: bool = True, numeric_only:
Optional[bool] = None
+ self, method: str = "average", ascending: bool = True, numeric_only:
bool = False
) -> "DataFrame":
"""
Compute numerical data ranks (1 through n) along axis. Equal values are
@@ -11171,9 +11169,13 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
* dense: like 'min', but rank always increases by 1 between groups
ascending : boolean, default True
False for ranks by high (1) to low (N)
- numeric_only : bool, optional
+ numeric_only : bool, default False
For DataFrame objects, rank only numeric columns if set to True.
+ .. versionchanged:: 4.0.0
+ The default value of ``numeric_only`` is now ``False``.
+
+
Returns
-------
ranks : same type as caller
@@ -11238,11 +11240,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
2 2.5
3 4.0
"""
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `None` in 4.0.0.",
- FutureWarning,
- )
if numeric_only:
numeric_col_names = []
for label in self._internal.column_labels:
@@ -12206,7 +12203,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
self,
q: Union[float, Iterable[float]] = 0.5,
axis: Axis = 0,
- numeric_only: bool = True,
+ numeric_only: bool = False,
accuracy: int = 10000,
) -> DataFrameOrSeries:
"""
@@ -12222,9 +12219,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
0 <= q <= 1, the quantile(s) to compute.
axis : int or str, default 0 or 'index'
Can only be set to 0 now.
- numeric_only : bool, default True
- If False, the quantile of datetime and time delta data will be
computed as well.
- Can only be set to True now.
+ numeric_only : bool, default False
+ Include only `float`, `int` or `boolean` data.
+
+ .. versionchanged:: 4.0.0
+ The default value of ``numeric_only`` is now ``False``.
+
accuracy : int, optional
Default accuracy of approximation. Larger value means better
accuracy.
The relative error can be deduced by 1.0 / accuracy.
@@ -12821,12 +12821,6 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
if numeric_only is None and axis == 0:
numeric_only = True
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `True` in 4.0.0.",
- FutureWarning,
- )
-
mode_scols: List[PySparkColumn] = []
mode_col_names: List[str] = []
mode_labels: List[Label] = []
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 7bd64376152..3d51fabd4b2 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -611,18 +611,17 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
min_count=min_count,
)
- def mean(self, numeric_only: Optional[bool] = True) -> FrameLike:
+ def mean(self, numeric_only: Optional[bool] = False) -> FrameLike:
"""
Compute mean of groups, excluding missing values.
Parameters
----------
- numeric_only : bool, default True
- Include only float, int, boolean columns. If None, will attempt to
use
- everything, then use only numeric data. False is not supported.
- This parameter is mainly for pandas compatibility.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
.. versionadded:: 3.4.0
+ .. versionchanged:: 4.0.0
Returns
-------
@@ -842,7 +841,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
bool_to_numeric=True,
)
- def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) ->
FrameLike:
+ def sum(self, numeric_only: bool = False, min_count: int = 0) -> FrameLike:
"""
Compute sum of group values
@@ -851,11 +850,10 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
Parameters
----------
numeric_only : bool, default False
- Include only float, int, boolean columns. If None, will attempt to
use
- everything, then use only numeric data.
- It takes no effect since only numeric columns can be support here.
+ Include only float, int, boolean columns.
.. versionadded:: 3.4.0
+ .. versionchanged:: 4.0.0
min_count : int, default 0
The required number of valid values to perform the operation.
If fewer than min_count non-NA values are present the result will
be NA.
@@ -897,11 +895,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
pyspark.pandas.Series.groupby
pyspark.pandas.DataFrame.groupby
"""
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `True` in 4.0.0.",
- FutureWarning,
- )
if numeric_only is not None and not isinstance(numeric_only, bool):
raise TypeError("numeric_only must be None or bool")
if not isinstance(min_count, int):
@@ -927,7 +920,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
)
# TODO: sync the doc.
- def var(self, ddof: int = 1, numeric_only: Optional[bool] = True) ->
FrameLike:
+ def var(self, ddof: int = 1, numeric_only: bool = False) -> FrameLike:
"""
Compute variance of groups, excluding missing values.
@@ -942,10 +935,8 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
.. versionchanged:: 3.4.0
Supported including arbitary integers.
- numeric_only : bool, default True
- Include only float, int, boolean columns. If None, will attempt
to use
- everything, then use only numeric data. False is not supported.
- This parameter is mainly for pandas compatibility.
+ numeric_only : bool, default False
+ Include only float, int, boolean columns.
.. versionadded:: 4.0.0
@@ -1179,7 +1170,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
return self._prepare_return(DataFrame(internal),
agg_column_names=agg_column_names)
- def prod(self, numeric_only: Optional[bool] = True, min_count: int = 0) ->
FrameLike:
+ def prod(self, numeric_only: bool = False, min_count: int = 0) ->
FrameLike:
"""
Compute prod of groups.
@@ -1188,8 +1179,9 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
Parameters
----------
numeric_only : bool, default False
- Include only float, int, boolean columns. If None, will attempt to
use
- everything, then use only numeric data.
+ Include only float, int, boolean columns.
+
+ .. versionchanged:: 4.0.0
min_count : int, default 0
The required number of valid values to perform the operation.
@@ -1235,12 +1227,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
if not isinstance(min_count, int):
raise TypeError("min_count must be integer")
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `True` in 4.0.0.",
- FutureWarning,
- )
-
self._validate_agg_columns(numeric_only=numeric_only,
function_name="prod")
return self._reduce_for_stat_function(
@@ -3441,7 +3427,7 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
return self._handle_output(DataFrame(internal))
- def median(self, numeric_only: Optional[bool] = True, accuracy: int =
10000) -> FrameLike:
+ def median(self, numeric_only: bool = False, accuracy: int = 10000) ->
FrameLike:
"""
Compute median of groups, excluding missing values.
@@ -3454,10 +3440,10 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
Parameters
----------
numeric_only : bool, default False
- Include only float, int, boolean columns. If None, will attempt to
use
- everything, then use only numeric data.
+ Include only float, int, boolean columns.
.. versionadded:: 3.4.0
+ .. versionchanged:: 4.0.0
Returns
-------
@@ -3509,12 +3495,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
self._validate_agg_columns(numeric_only=numeric_only,
function_name="median")
- warnings.warn(
- "Default value of `numeric_only` will be changed to `False` "
- "instead of `True` in 4.0.0.",
- FutureWarning,
- )
-
def stat_function(col: Column) -> Column:
return F.percentile_approx(col, 0.5, accuracy)
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index f1b785e1b41..e96e5c3b3dc 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -4054,7 +4054,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
# TODO: add axis, pct, na_option parameter
def rank(
- self, method: str = "average", ascending: bool = True, numeric_only:
Optional[bool] = None
+ self, method: str = "average", ascending: bool = True, numeric_only:
bool = False
) -> "Series":
"""
Compute numerical data ranks (1 through n) along axis. Equal values are
@@ -4075,9 +4075,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
* dense: like 'min', but rank always increases by 1 between groups
ascending : boolean, default True
False for ranks by high (1) to low (N)
- numeric_only : bool, optional
- If set to True, rank numeric Series, or raise TypeError for
non-numeric Series.
- False is not supported. This parameter is mainly for pandas
compatibility.
+ numeric_only : bool, default False
+ For DataFrame objects, rank only numeric columns if set to True.
+
+ .. versionchanged:: 4.0.0
+ The default value of ``numeric_only`` is now ``False``.
+
Returns
-------
@@ -7033,7 +7036,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
sfun: Callable[["Series"], PySparkColumn],
name: str_type,
axis: Optional[Axis] = None,
- numeric_only: bool = True,
+ numeric_only: bool = False,
skipna: bool = True,
**kwargs: Any,
) -> Scalar:
diff --git a/python/pyspark/pandas/tests/computation/test_compute.py
b/python/pyspark/pandas/tests/computation/test_compute.py
index dc145601fca..7f17a3bc6f0 100644
--- a/python/pyspark/pandas/tests/computation/test_compute.py
+++ b/python/pyspark/pandas/tests/computation/test_compute.py
@@ -352,9 +352,10 @@ class FrameComputeMixin:
pdf = pd.DataFrame({"x": ["a", "b", "c"]})
psdf = ps.from_pandas(pdf)
- self.assert_eq(psdf.quantile(0.5), pdf.quantile(0.5,
numeric_only=True))
+ self.assert_eq(psdf.quantile(0.5, numeric_only=True),
pdf.quantile(0.5, numeric_only=True))
self.assert_eq(
- psdf.quantile([0.25, 0.5, 0.75]), pdf.quantile([0.25, 0.5, 0.75],
numeric_only=True)
+ psdf.quantile([0.25, 0.5, 0.75], numeric_only=True),
+ pdf.quantile([0.25, 0.5, 0.75], numeric_only=True),
)
with self.assertRaisesRegex(TypeError, "Could not convert object
\\(string\\) to numeric"):
@@ -432,6 +433,9 @@ class FrameComputeMixin:
class FrameComputeTests(FrameComputeMixin, ComparisonTestBase, SQLTestUtils):
+ def test_quantile(self):
+ super().test_quantile()
+
pass
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]