This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new 1e0d88b481d [SPARK-42593][PS] Deprecate & remove the APIs that will be
removed in pandas 2.0
1e0d88b481d is described below
commit 1e0d88b481da052f5ca431d69bc010ef359a6496
Author: itholic <[email protected]>
AuthorDate: Wed Mar 1 18:31:34 2023 +0900
[SPARK-42593][PS] Deprecate & remove the APIs that will be removed in
pandas 2.0
### What changes were proposed in this pull request?
This PR proposes to mark the APIs as deprecated or remove the APIs that
will be deprecated or removed in upcoming pandas 2.0.0 release.
See [What's new in
2.0.0](https://pandas.pydata.org/pandas-docs/version/2.0/whatsnew/v2.0.0.html#removal-of-prior-version-deprecations-changes)
for more detail.
### Why are the changes needed?
We should match the behavior to pandas API.
### Does this PR introduce _any_ user-facing change?
Yes, some APIs will be removed, so they will no longer be available.
### How was this patch tested?
Fixed existing unit tests where necessary.
Closes #40216 from itholic/SPARK-42593.
Authored-by: itholic <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit 9d2fe90c9c88e5cc781b8058087a1cb1bf94f22d)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/base.py | 86 +++++++++++-
python/pyspark/pandas/categorical.py | 174 ++++--------------------
python/pyspark/pandas/datetimes.py | 9 ++
python/pyspark/pandas/frame.py | 29 ++++
python/pyspark/pandas/generic.py | 6 +
python/pyspark/pandas/groupby.py | 35 ++++-
python/pyspark/pandas/indexes/base.py | 21 ++-
python/pyspark/pandas/indexes/numeric.py | 4 +
python/pyspark/pandas/missing/frame.py | 4 -
python/pyspark/pandas/missing/groupby.py | 2 -
python/pyspark/pandas/missing/indexes.py | 2 -
python/pyspark/pandas/missing/resample.py | 4 -
python/pyspark/pandas/missing/series.py | 6 -
python/pyspark/pandas/namespace.py | 20 +++
python/pyspark/pandas/plot/matplotlib.py | 3 +
python/pyspark/pandas/series.py | 24 +++-
python/pyspark/pandas/tests/test_categorical.py | 101 +-------------
python/pyspark/pandas/window.py | 2 +-
18 files changed, 268 insertions(+), 264 deletions(-)
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index bb9671e8da6..cd0f5a13aee 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -18,6 +18,7 @@
"""
Base and utility classes for pandas-on-Spark objects.
"""
+import warnings
from abc import ABCMeta, abstractmethod
from functools import wraps, partial
from itertools import chain
@@ -544,6 +545,8 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
.. note:: Disable the Spark config
`spark.sql.optimizer.nestedSchemaPruning.enabled`
for multi-index if you're using pandas-on-Spark < 1.7.0 with
PySpark 3.1.1.
+ .. deprecated:: 3.4.0
+
Returns
-------
is_monotonic : bool
@@ -605,9 +608,88 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
>>> midx.is_monotonic
False
"""
+ warnings.warn(
+ "is_monotonic is deprecated and will be removed in a future
version. "
+ "Use is_monotonic_increasing instead.",
+ FutureWarning,
+ )
return self._is_monotonic("increasing")
- is_monotonic_increasing = is_monotonic
+ @property
+ def is_monotonic_increasing(self) -> bool:
+ """
+ Return boolean if values in the object are monotonically increasing.
+
+ .. note:: the current implementation of is_monotonic_increasing
requires to shuffle
+ and aggregate multiple times to check the order locally and
globally,
+ which is potentially expensive. In case of multi-index, all data is
+ transferred to a single node which can easily cause out-of-memory
errors.
+
+ .. note:: Disable the Spark config
`spark.sql.optimizer.nestedSchemaPruning.enabled`
+ for multi-index if you're using pandas-on-Spark < 1.7.0 with
PySpark 3.1.1.
+
+ Returns
+ -------
+ is_monotonic : bool
+
+ Examples
+ --------
+ >>> ser = ps.Series(['1/1/2018', '3/1/2018', '4/1/2018'])
+ >>> ser.is_monotonic_increasing
+ True
+
+ >>> df = ps.DataFrame({'dates': [None, '1/1/2018', '2/1/2018',
'3/1/2018']})
+ >>> df.dates.is_monotonic_increasing
+ False
+
+ >>> df.index.is_monotonic_increasing
+ True
+
+ >>> ser = ps.Series([1])
+ >>> ser.is_monotonic_increasing
+ True
+
+ >>> ser = ps.Series([])
+ >>> ser.is_monotonic_increasing
+ True
+
+ >>>
ser.rename("a").to_frame().set_index("a").index.is_monotonic_increasing
+ True
+
+ >>> ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])
+ >>> ser.is_monotonic_increasing
+ False
+
+ >>> ser.index.is_monotonic_increasing
+ True
+
+ Support for MultiIndex
+
+ >>> midx = ps.MultiIndex.from_tuples(
+ ... [('x', 'a'), ('x', 'b'), ('y', 'c'), ('y', 'd'), ('z', 'e')])
+ >>> midx # doctest: +SKIP
+ MultiIndex([('x', 'a'),
+ ('x', 'b'),
+ ('y', 'c'),
+ ('y', 'd'),
+ ('z', 'e')],
+ )
+ >>> midx.is_monotonic_increasing
+ True
+
+ >>> midx = ps.MultiIndex.from_tuples(
+ ... [('z', 'a'), ('z', 'b'), ('y', 'c'), ('y', 'd'), ('x', 'e')])
+ >>> midx # doctest: +SKIP
+ MultiIndex([('z', 'a'),
+ ('z', 'b'),
+ ('y', 'c'),
+ ('y', 'd'),
+ ('x', 'e')],
+ )
+ >>> midx.is_monotonic_increasing
+ False
+ """
+ return self._is_monotonic("increasing")
@property
def is_monotonic_decreasing(self) -> bool:
@@ -1541,6 +1623,8 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
Value to mark "not found". If None, will not drop the NaN
from the uniques of the values.
+ .. deprecated:: 3.4.0
+
Returns
-------
codes : Series or Index
diff --git a/python/pyspark/pandas/categorical.py
b/python/pyspark/pandas/categorical.py
index 9a4c4076ef3..36b11caf5b6 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -172,9 +172,7 @@ class CategoricalAccessor:
),
).rename()
- def add_categories(
- self, new_categories: Union[pd.Index, Any, List], inplace: bool = False
- ) -> Optional["ps.Series"]:
+ def add_categories(self, new_categories: Union[pd.Index, Any, List]) ->
Optional["ps.Series"]:
"""
Add new categories.
@@ -185,11 +183,6 @@ class CategoricalAccessor:
----------
new_categories : category or list-like of category
The new categories to be included.
- inplace : bool, default False
- Whether or not to add the categories inplace or return a copy of
- this categorical with added categories.
-
- .. deprecated:: 3.2.0
Returns
-------
@@ -235,13 +228,6 @@ class CategoricalAccessor:
"""
from pyspark.pandas.frame import DataFrame
- if inplace:
- warnings.warn(
- "The `inplace` parameter in add_categories is deprecated "
- "and will be removed in a future version.",
- FutureWarning,
- )
-
categories: List[Any]
if is_list_like(new_categories):
categories = list(new_categories)
@@ -262,11 +248,7 @@ class CategoricalAccessor:
dtype=CategoricalDtype(list(self.categories) + categories,
ordered=self.ordered)
),
)
- if inplace:
- self._data._psdf._update_internal_frame(internal)
- return None
- else:
- return
DataFrame(internal)._psser_for(self._data._column_label).copy()
+ return DataFrame(internal)._psser_for(self._data._column_label).copy()
def _set_ordered(self, *, ordered: bool, inplace: bool) ->
Optional["ps.Series"]:
from pyspark.pandas.frame import DataFrame
@@ -300,6 +282,8 @@ class CategoricalAccessor:
Whether or not to set the ordered attribute in-place or return
a copy of this categorical with ordered set to True.
+ .. deprecated:: 3.4.0
+
Returns
-------
Series or None
@@ -328,6 +312,12 @@ class CategoricalAccessor:
dtype: category
Categories (3, object): ['a' < 'b' < 'c']
"""
+ if inplace:
+ warnings.warn(
+ "The `inplace` parameter in as_ordered is deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
return self._set_ordered(ordered=True, inplace=inplace)
def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]:
@@ -340,6 +330,8 @@ class CategoricalAccessor:
Whether or not to set the ordered attribute in-place or return
a copy of this categorical with ordered set to False.
+ .. deprecated:: 3.4.0
+
Returns
-------
Series or None
@@ -368,11 +360,15 @@ class CategoricalAccessor:
dtype: category
Categories (3, object): ['a', 'b', 'c']
"""
+ if inplace:
+ warnings.warn(
+ "The `inplace` parameter in as_unordered is deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
return self._set_ordered(ordered=False, inplace=inplace)
- def remove_categories(
- self, removals: Union[pd.Index, Any, List], inplace: bool = False
- ) -> Optional["ps.Series"]:
+ def remove_categories(self, removals: Union[pd.Index, Any, List]) ->
Optional["ps.Series"]:
"""
Remove the specified categories.
@@ -383,11 +379,6 @@ class CategoricalAccessor:
----------
removals : category or list of categories
The categories which should be removed.
- inplace : bool, default False
- Whether or not to remove the categories inplace or return a copy of
- this categorical with removed categories.
-
- .. deprecated:: 3.2.0
Returns
-------
@@ -430,13 +421,6 @@ class CategoricalAccessor:
dtype: category
Categories (2, object): ['a', 'c']
"""
- if inplace:
- warnings.warn(
- "The `inplace` parameter in remove_categories is deprecated "
- "and will be removed in a future version.",
- FutureWarning,
- )
-
categories: List[Any]
if is_list_like(removals):
categories = [cat for cat in removals if cat is not None]
@@ -455,39 +439,17 @@ class CategoricalAccessor:
)
if len(categories) == 0:
- if inplace:
- return None
- else:
- return self._data.copy()
+ return self._data.copy()
else:
dtype = CategoricalDtype(
[cat for cat in self.categories if cat not in categories],
ordered=self.ordered
)
- psser = self._data.astype(dtype)
-
- if inplace:
- internal = self._data._psdf._internal.with_new_spark_column(
- self._data._column_label,
- psser.spark.column,
- field=psser._internal.data_fields[0],
- )
- self._data._psdf._update_internal_frame(internal)
- return None
- else:
- return psser
+ return self._data.astype(dtype)
- def remove_unused_categories(self, inplace: bool = False) ->
Optional["ps.Series"]:
+ def remove_unused_categories(self) -> Optional["ps.Series"]:
"""
Remove categories which are not used.
- Parameters
- ----------
- inplace : bool, default False
- Whether or not to drop unused categories inplace or return a copy of
- this categorical with unused categories dropped.
-
- .. deprecated:: 3.2.0
-
Returns
-------
cat : Series or None
@@ -524,19 +486,12 @@ class CategoricalAccessor:
dtype: category
Categories (3, object): ['a', 'b', 'c']
"""
- if inplace:
- warnings.warn(
- "The `inplace` parameter in remove_unused_categories is
deprecated "
- "and will be removed in a future version.",
- FutureWarning,
- )
-
categories = set(self._data.drop_duplicates()._to_pandas())
removals = [cat for cat in self.categories if cat not in categories]
- return self.remove_categories(removals=removals, inplace=inplace)
+ return self.remove_categories(removals=removals)
def rename_categories(
- self, new_categories: Union[list, dict, Callable], inplace: bool =
False
+ self, new_categories: Union[list, dict, Callable]
) -> Optional["ps.Series"]:
"""
Rename categories.
@@ -558,12 +513,6 @@ class CategoricalAccessor:
* callable : a callable that is called on all items in the old
categories and whose return values comprise the new categories.
- inplace : bool, default False
- Whether or not to rename the categories inplace or return a copy of
- this categorical with renamed categories.
-
- .. deprecated:: 3.2.0
-
Returns
-------
cat : Series or None
@@ -614,13 +563,6 @@ class CategoricalAccessor:
"""
from pyspark.pandas.frame import DataFrame
- if inplace:
- warnings.warn(
- "The `inplace` parameter in rename_categories is deprecated "
- "and will be removed in a future version.",
- FutureWarning,
- )
-
if is_dict_like(new_categories):
categories = [cast(dict, new_categories).get(item, item) for item
in self.categories]
elif callable(new_categories):
@@ -642,17 +584,12 @@ class CategoricalAccessor:
),
)
- if inplace:
- self._data._psdf._update_internal_frame(internal)
- return None
- else:
- return
DataFrame(internal)._psser_for(self._data._column_label).copy()
+ return DataFrame(internal)._psser_for(self._data._column_label).copy()
def reorder_categories(
self,
new_categories: Union[pd.Index, List],
ordered: Optional[bool] = None,
- inplace: bool = False,
) -> Optional["ps.Series"]:
"""
Reorder categories as specified in new_categories.
@@ -667,11 +604,6 @@ class CategoricalAccessor:
ordered : bool, optional
Whether or not the categorical is treated as an ordered categorical.
If not given, do not change the ordered information.
- inplace : bool, default False
- Whether or not to reorder the categories inplace or return a copy of
- this categorical with reordered categories.
-
- .. deprecated:: 3.2.0
Returns
-------
@@ -715,13 +647,6 @@ class CategoricalAccessor:
dtype: category
Categories (3, object): ['c' < 'b' < 'a']
"""
- if inplace:
- warnings.warn(
- "The `inplace` parameter in reorder_categories is deprecated "
- "and will be removed in a future version.",
- FutureWarning,
- )
-
if not is_list_like(new_categories):
raise TypeError(
"Parameter 'new_categories' must be list-like, was
'{}'".format(new_categories)
@@ -735,31 +660,16 @@ class CategoricalAccessor:
ordered = self.ordered
if new_categories == list(self.categories) and ordered == self.ordered:
- if inplace:
- return None
- else:
- return self._data.copy()
+ return self._data.copy()
else:
dtype = CategoricalDtype(categories=new_categories,
ordered=ordered)
- psser = _to_cat(self._data).astype(dtype)
-
- if inplace:
- internal = self._data._psdf._internal.with_new_spark_column(
- self._data._column_label,
- psser.spark.column,
- field=psser._internal.data_fields[0],
- )
- self._data._psdf._update_internal_frame(internal)
- return None
- else:
- return psser
+ return _to_cat(self._data).astype(dtype)
def set_categories(
self,
new_categories: Union[pd.Index, List],
ordered: Optional[bool] = None,
rename: bool = False,
- inplace: bool = False,
) -> Optional["ps.Series"]:
"""
Set the categories to the specified new_categories.
@@ -790,11 +700,6 @@ class CategoricalAccessor:
rename : bool, default False
Whether or not the new_categories should be considered as a rename
of the old categories or as reordered categories.
- inplace : bool, default False
- Whether or not to reorder the categories in-place or return a copy
- of this categorical with reordered categories.
-
- .. deprecated:: 3.2.0
Returns
-------
@@ -858,13 +763,6 @@ class CategoricalAccessor:
"""
from pyspark.pandas.frame import DataFrame
- if inplace:
- warnings.warn(
- "The `inplace` parameter in set_categories is deprecated "
- "and will be removed in a future version.",
- FutureWarning,
- )
-
if not is_list_like(new_categories):
raise TypeError(
"Parameter 'new_categories' must be list-like, was
'{}'".format(new_categories)
@@ -889,23 +787,9 @@ class CategoricalAccessor:
field=self._data._internal.data_fields[0].copy(dtype=new_dtype),
)
- if inplace:
- self._data._psdf._update_internal_frame(internal)
- return None
- else:
- return
DataFrame(internal)._psser_for(self._data._column_label).copy()
+ return
DataFrame(internal)._psser_for(self._data._column_label).copy()
else:
- psser = self._data.astype(new_dtype)
- if inplace:
- internal = self._data._psdf._internal.with_new_spark_column(
- self._data._column_label,
- psser.spark.column,
- field=psser._internal.data_fields[0],
- )
- self._data._psdf._update_internal_frame(internal)
- return None
- else:
- return psser
+ return self._data.astype(new_dtype)
def _test() -> None:
diff --git a/python/pyspark/pandas/datetimes.py
b/python/pyspark/pandas/datetimes.py
index a9caf1977fb..752f6f46282 100644
--- a/python/pyspark/pandas/datetimes.py
+++ b/python/pyspark/pandas/datetimes.py
@@ -18,6 +18,7 @@
"""
Date/Time related functions on pandas-on-Spark Series
"""
+import warnings
from typing import Any, Optional, Union, no_type_check
import numpy as np
@@ -115,11 +116,19 @@ class DatetimeMethods:
def nanosecond(self) -> "ps.Series":
raise NotImplementedError()
+ # TODO(SPARK-42617): Support isocalendar.week and replace it.
+ # See also https://github.com/pandas-dev/pandas/pull/33595.
@property
def week(self) -> "ps.Series":
"""
The week ordinal of the year.
+
+ .. deprecated:: 3.4.0
"""
+ warnings.warn(
+ "weekofyear and week have been deprecated.",
+ FutureWarning,
+ )
return self._data.spark.transform(lambda c:
F.weekofyear(c).cast(LongType()))
@property
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 4a6c2119104..dd09331e49c 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -2508,6 +2508,9 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
The subset of columns to write. Writes all columns by default.
col_space : int, optional
The minimum width of each column.
+
+ .. deprecated:: 3.4.0
+
header : bool or list of str, default True
Write out the column names. If a list of strings is given, it is
assumed to be aliases
for the column names.
@@ -3494,6 +3497,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
).resolved_copy
return DataFrame(internal)
+ # TODO(SPARK-42620): Add `inclusive` parameter and replace `include_start`
& `include_end`.
+ # See https://github.com/pandas-dev/pandas/issues/43248
def between_time(
self,
start_time: Union[datetime.time, str],
@@ -3516,8 +3521,14 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
End time as a time filter limit.
include_start : bool, default True
Whether the start time needs to be included in the result.
+
+ .. deprecated:: 3.4.0
+
include_end : bool, default True
Whether the end time needs to be included in the result.
+
+ .. deprecated:: 3.4.0
+
axis : {0 or 'index', 1 or 'columns'}, default 0
Determine range time on index or columns value.
@@ -8813,6 +8824,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Columns in other that are not in the caller are added as new columns.
+ .. deprecated:: 3.4.0
+
Parameters
----------
other : DataFrame or Series/dict-like object, or list of these
@@ -8849,6 +8862,12 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
2 1 2
3 3 4
"""
+ warnings.warn(
+ "The DataFrame.append method is deprecated "
+ "and will be removed in a future version. "
+ "Use pyspark.pandas.concat instead.",
+ FutureWarning,
+ )
if isinstance(other, ps.Series):
raise TypeError("DataFrames.append() does not support appending
Series to DataFrames")
if sort:
@@ -11990,6 +12009,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
return cast(ps.Series,
ps.from_pandas(psdf._to_internal_pandas().idxmin()))
+ # TODO(SPARK-41619): Add `show_counts` parameter and replace with
`null_counts`.
def info(
self,
verbose: Optional[bool] = None,
@@ -12018,6 +12038,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
null_counts : bool, optional
Whether to show the non-null counts.
+ .. deprecated:: 3.4.0
+
Returns
-------
None
@@ -12663,6 +12685,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
"""
Return the mean absolute deviation of values.
+ .. deprecated:: 3.4.0
+
Parameters
----------
axis : {index (0), columns (1)}
@@ -12685,6 +12709,11 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
3 NaN
dtype: float64
"""
+ warnings.warn(
+ "The 'mad' method is deprecated and will be removed in a future
version. "
+ "To compute the same result, you may do `(df -
df.mean()).abs().mean()`.",
+ FutureWarning,
+ )
from pyspark.pandas.series import first_series
axis = validate_axis(axis)
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index ae0aaf07a44..786db59a29d 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -1099,11 +1099,17 @@ class Frame(object, metaclass=ABCMeta):
encoding: str, optional
Encoding of the resulting excel file. Only necessary for xlwt,
other writers support unicode natively.
+
+ .. deprecated:: 3.4.0
+
inf_rep: str, default 'inf'
Representation for infinity (there is no native representation for
infinity in Excel).
verbose: bool, default True
Display more information in the error logs.
+
+ .. deprecated:: 3.4.0
+
freeze_panes: tuple of int (length 2), optional
Specifies the one-based bottommost row and rightmost column that
is to be frozen.
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 35adba1e0a4..7a81ede4201 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -988,6 +988,8 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
.. versionadded:: 3.4.0
+ .. deprecated:: 3.4.0
+
Examples
--------
>>> df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False,
True],
@@ -1010,6 +1012,11 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
pyspark.pandas.Series.groupby
pyspark.pandas.DataFrame.groupby
"""
+ warnings.warn(
+ "The 'mad' method is deprecated and will be removed in a future
version. "
+ "To compute the same result, you may do `(group_df -
group_df.mean()).abs().mean()`.",
+ FutureWarning,
+ )
groupkey_names = [SPARK_INDEX_NAME_FORMAT(i) for i in
range(len(self._groupkeys))]
internal, agg_columns, sdf = self._prepare_reduce(
groupkey_names=groupkey_names,
@@ -2644,7 +2651,19 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
"""
return self.fillna(method="bfill", limit=limit)
- backfill = bfill
+ def backfill(self, limit: Optional[int] = None) -> FrameLike:
+ """
+ Alias for bfill.
+
+ .. deprecated:: 3.4.0
+ """
+ warnings.warn(
+ "The GroupBy.backfill method is deprecated "
+ "and will be removed in a future version. "
+ "Use GroupBy.bfill instead.",
+ FutureWarning,
+ )
+ return self.bfill(limit=limit)
def ffill(self, limit: Optional[int] = None) -> FrameLike:
"""
@@ -2695,7 +2714,19 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
"""
return self.fillna(method="ffill", limit=limit)
- pad = ffill
+ def pad(self, limit: Optional[int] = None) -> FrameLike:
+ """
+ Alias for ffill.
+
+ .. deprecated:: 3.4.0
+ """
+ warnings.warn(
+ "The GroupBy.pad method is deprecated "
+ "and will be removed in a future version. "
+ "Use GroupBy.ffill instead.",
+ FutureWarning,
+ )
+ return self.ffill(limit=limit)
def _limit(self, n: int, asc: bool) -> FrameLike:
"""
diff --git a/python/pyspark/pandas/indexes/base.py
b/python/pyspark/pandas/indexes/base.py
index 05c6b58032d..66d285b277f 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -645,6 +645,8 @@ class Index(IndexOpsMixin):
.. note:: This method should only be used if the resulting NumPy
ndarray is expected
to be small, as all the data is loaded into the driver's memory.
+ .. deprecated:: 3.4.0
+
Returns
-------
numpy.ndarray
@@ -660,7 +662,11 @@ class Index(IndexOpsMixin):
>>> ps.Index(['a', 'b', 'c']).asi8 is None
True
"""
- warnings.warn("We recommend using `{}.to_numpy()`
instead.".format(type(self).__name__))
+ warnings.warn(
+ "Index.asi8 is deprecated and will be removed in a future version.
"
+ "We recommend using `{}.to_numpy()`
instead.".format(type(self).__name__),
+ FutureWarning,
+ )
if isinstance(self.spark.data_type, IntegralType):
return self.to_numpy()
elif isinstance(self.spark.data_type, (TimestampType,
TimestampNTZType)):
@@ -1128,6 +1134,8 @@ class Index(IndexOpsMixin):
"""
Whether the index type is compatible with the provided type.
+ .. deprecated:: 3.4.0
+
Examples
--------
>>> psidx = ps.Index([1, 2, 3])
@@ -1140,6 +1148,10 @@ class Index(IndexOpsMixin):
>>> psidx.is_type_compatible('floating')
True
"""
+ warnings.warn(
+ "Index.is_type_compatible is deprecated and will be removed in a "
"future version",
+ FutureWarning,
+ )
return kind == self.inferred_type
def dropna(self, how: str = "any") -> "Index":
@@ -2185,6 +2197,8 @@ class Index(IndexOpsMixin):
remember that since pandas-on-Spark does not support multiple data
types in an index,
so it returns True if any type of data is datetime.
+ .. deprecated:: 3.4.0
+
Examples
--------
>>> from datetime import datetime
@@ -2210,6 +2224,11 @@ class Index(IndexOpsMixin):
>>> idx.is_all_dates
False
"""
+ warnings.warn(
+ "Index.is_all_dates is deprecated, will be removed in a future
version. "
+ "check index.inferred_type instead",
+ FutureWarning,
+ )
return isinstance(self.spark.data_type, (TimestampType,
TimestampNTZType))
def repeat(self, repeats: int) -> "Index":
diff --git a/python/pyspark/pandas/indexes/numeric.py
b/python/pyspark/pandas/indexes/numeric.py
index c3d2dabb79f..a124fefef51 100644
--- a/python/pyspark/pandas/indexes/numeric.py
+++ b/python/pyspark/pandas/indexes/numeric.py
@@ -48,6 +48,8 @@ class Int64Index(IntegerIndex):
storing axis labels for all pandas objects. Int64Index is a special case
of `Index` with purely integer labels.
+ .. deprecated:: 3.4.0
+
Parameters
----------
data : array-like (1-dimensional)
@@ -110,6 +112,8 @@ class Float64Index(NumericIndex):
storing axis labels for all pandas objects. Float64Index is a special case
of `Index` with purely float labels.
+ .. deprecated:: 3.4.0
+
Parameters
----------
data : array-like (1-dimensional)
diff --git a/python/pyspark/pandas/missing/frame.py
b/python/pyspark/pandas/missing/frame.py
index 4c4c31528b0..5ba81c81b36 100644
--- a/python/pyspark/pandas/missing/frame.py
+++ b/python/pyspark/pandas/missing/frame.py
@@ -53,10 +53,6 @@ class MissingPandasLikeDataFrame:
tz_localize = _unsupported_function("tz_localize")
# Deprecated functions
- tshift = _unsupported_function("tshift", deprecated=True, reason="Please
use shift instead.")
- slice_shift = _unsupported_function(
- "slice_shift", deprecated=True, reason="You can use
DataFrame/Series.shift instead."
- )
lookup = _unsupported_function(
"lookup", deprecated=True, reason="Use DataFrame.melt and
DataFrame.loc instead."
)
diff --git a/python/pyspark/pandas/missing/groupby.py
b/python/pyspark/pandas/missing/groupby.py
index 1799fac0033..55a4a1d5967 100644
--- a/python/pyspark/pandas/missing/groupby.py
+++ b/python/pyspark/pandas/missing/groupby.py
@@ -50,7 +50,6 @@ class MissingPandasLikeDataFrameGroupBy:
indices = _unsupported_property("indices")
ngroups = _unsupported_property("ngroups")
plot = _unsupported_property("plot")
- tshift = _unsupported_property("tshift")
# Deprecated properties
take = _unsupported_property("take", deprecated=True)
@@ -79,7 +78,6 @@ class MissingPandasLikeSeriesGroupBy:
is_monotonic_increasing = _unsupported_property("is_monotonic_increasing")
ngroups = _unsupported_property("ngroups")
plot = _unsupported_property("plot")
- tshift = _unsupported_property("tshift")
# Deprecated properties
take = _unsupported_property("take", deprecated=True)
diff --git a/python/pyspark/pandas/missing/indexes.py
b/python/pyspark/pandas/missing/indexes.py
index 99fcf367f3e..2419908b312 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -51,7 +51,6 @@ class MissingPandasLikeIndex:
get_indexer_non_unique = _unsupported_function("get_indexer_non_unique")
get_loc = _unsupported_function("get_loc")
get_slice_bound = _unsupported_function("get_slice_bound")
- get_value = _unsupported_function("get_value")
groupby = _unsupported_function("groupby")
is_ = _unsupported_function("is_")
join = _unsupported_function("join")
@@ -67,7 +66,6 @@ class MissingPandasLikeIndex:
is_mixed = _unsupported_function("is_mixed")
# Deprecated functions
- set_value = _unsupported_function("set_value", deprecated=True)
to_native_types = _unsupported_function("to_native_types", deprecated=True)
# Properties we won't support.
diff --git a/python/pyspark/pandas/missing/resample.py
b/python/pyspark/pandas/missing/resample.py
index a0e647e1e44..549e3c5be53 100644
--- a/python/pyspark/pandas/missing/resample.py
+++ b/python/pyspark/pandas/missing/resample.py
@@ -52,9 +52,7 @@ class MissingPandasLikeDataFrameResampler:
transform = _unsupported_function("transform")
pipe = _unsupported_function("pipe")
ffill = _unsupported_function("ffill")
- backfill = _unsupported_function("backfill")
bfill = _unsupported_function("bfill")
- pad = _unsupported_function("pad")
nearest = _unsupported_function("nearest")
fillna = _unsupported_function("fillna")
asfreq = _unsupported_function("asfreq")
@@ -86,9 +84,7 @@ class MissingPandasLikeSeriesResampler:
transform = _unsupported_function("transform")
pipe = _unsupported_function("pipe")
ffill = _unsupported_function("ffill")
- backfill = _unsupported_function("backfill")
bfill = _unsupported_function("bfill")
- pad = _unsupported_function("pad")
nearest = _unsupported_function("nearest")
fillna = _unsupported_function("fillna")
asfreq = _unsupported_function("asfreq")
diff --git a/python/pyspark/pandas/missing/series.py
b/python/pyspark/pandas/missing/series.py
index 74a544f0b94..4ee860d6654 100644
--- a/python/pyspark/pandas/missing/series.py
+++ b/python/pyspark/pandas/missing/series.py
@@ -48,12 +48,6 @@ class MissingPandasLikeSeries:
tz_localize = _unsupported_function("tz_localize")
view = _unsupported_function("view")
- # Deprecated functions
- slice_shift = _unsupported_function(
- "slice_shift", deprecated=True, reason="Use DataFrame/Series.shift
instead."
- )
- tshift = _unsupported_function("tshift", deprecated=True, reason="Use
`shift` instead.")
-
# Properties we won't support.
array = common.array(_unsupported_property)
nbytes = _unsupported_property(
diff --git a/python/pyspark/pandas/namespace.py
b/python/pyspark/pandas/namespace.py
index 70cc98468e9..6d5c4b79a39 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -261,11 +261,17 @@ def read_csv(
returning names where the callable function evaluates to `True`.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.
+
+ .. deprecated:: 3.4.0
+
mangle_dupe_cols : bool, default True
Duplicate columns will be specified as 'X0', 'X1', ... 'XN', rather
than 'X' ... 'X'. Passing in False will cause data to be overwritten if
there are duplicate names in the columns.
Currently only `True` is allowed.
+
+ .. deprecated:: 3.4.0
+
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
Use str or object
together with suitable na_values settings to preserve and not
interpret dtype.
@@ -978,6 +984,9 @@ def read_excel(
column if the callable returns ``True``.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.
+
+ .. deprecated:: 3.4.0
+
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
Use `object` to preserve data as stored in Excel and not interpret
dtype.
@@ -1049,10 +1058,16 @@ def read_excel(
Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
data will be read in as floats: Excel stores all numbers as floats
internally.
+
+ .. deprecated:: 3.4.0
+
mangle_dupe_cols : bool, default True
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather
than
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.
+
+ .. deprecated:: 3.4.0
+
**kwds : optional
Optional keyword arguments can be passed to ``TextFileReader``.
@@ -1733,6 +1748,8 @@ def to_datetime(
)
+# TODO(SPARK-42621): Add `inclusive` parameter and replace `closed`.
+# See https://github.com/pandas-dev/pandas/issues/40245
def date_range(
start: Union[str, Any] = None,
end: Union[str, Any] = None,
@@ -1768,6 +1785,9 @@ def date_range(
closed : {None, 'left', 'right'}, optional
Make the interval closed with respect to the given frequency to
the 'left', 'right', or both sides (None, the default).
+
+ .. deprecated:: 3.4.0
+
**kwargs
For compatibility. Has no effect on the result.
diff --git a/python/pyspark/pandas/plot/matplotlib.py
b/python/pyspark/pandas/plot/matplotlib.py
index c43d60dd66e..b64586ae85b 100644
--- a/python/pyspark/pandas/plot/matplotlib.py
+++ b/python/pyspark/pandas/plot/matplotlib.py
@@ -837,6 +837,9 @@ def plot_frame(
labels with "(right)" in the legend
sort_columns: bool, default is False
When True, will sort values on plots.
+
+ .. deprecated:: 3.4.0
+
**kwds : keywords
Options to pass to matplotlib plotting method
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 2f55c911985..5d6c25eca69 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -3564,6 +3564,8 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
"""
Concatenate two or more Series.
+ .. deprecated:: 3.4.0
+
Parameters
----------
to_append : Series or list/tuple of Series
@@ -3611,6 +3613,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
5 6
dtype: int64
"""
+ warnings.warn(
+ "The Series.append method is deprecated "
+ "and will be removed in a future version. "
+ "Use pyspark.pandas.concat instead.",
+ FutureWarning,
+ )
return first_series(
self.to_frame().append(to_append.to_frame(), ignore_index,
verify_integrity)
).rename(self.name)
@@ -5903,6 +5911,8 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
"""
Return the mean absolute deviation of values.
+ .. deprecated:: 3.4.0
+
Examples
--------
>>> s = ps.Series([1, 2, 3, 4])
@@ -5916,7 +5926,11 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
>>> s.mad()
1.0
"""
-
+ warnings.warn(
+ "The 'mad' method is deprecated and will be removed in a future
version. "
+ "To compute the same result, you may do `(series -
series.mean()).abs().mean()`.",
+ FutureWarning,
+ )
sdf = self._internal.spark_frame
spark_column = self.spark.column
avg = unpack_scalar(sdf.select(F.avg(spark_column)))
@@ -6779,6 +6793,8 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
return (left_ser.copy(), right.copy()) if copy else (left_ser, right)
+ # TODO(SPARK-42620): Add `inclusive` parameter and replace `include_start`
& `include_end`.
+ # See https://github.com/pandas-dev/pandas/issues/43248
def between_time(
self,
start_time: Union[datetime.time, str],
@@ -6801,8 +6817,14 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
End time as a time filter limit.
include_start : bool, default True
Whether the start time needs to be included in the result.
+
+ .. deprecated:: 3.4.0
+
include_end : bool, default True
Whether the end time needs to be included in the result.
+
+ .. deprecated:: 3.4.0
+
axis : {0 or 'index', 1 or 'columns'}, default 0
Determine range time on index or columns value.
diff --git a/python/pyspark/pandas/tests/test_categorical.py
b/python/pyspark/pandas/tests/test_categorical.py
index d5a660a66ef..556265f8308 100644
--- a/python/pyspark/pandas/tests/test_categorical.py
+++ b/python/pyspark/pandas/tests/test_categorical.py
@@ -92,14 +92,8 @@ class CategoricalTest(ComparisonTestBase, TestUtils):
self.assert_eq(pser.cat.add_categories([4, 5]),
psser.cat.add_categories([4, 5]))
self.assert_eq(pser.cat.add_categories([]),
psser.cat.add_categories([]))
- pser.cat.add_categories(4, inplace=True)
- psser.cat.add_categories(4, inplace=True)
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=[1, 2, 3, 4]))
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
+ pser = pser.cat.add_categories(4)
+ psser = psser.cat.add_categories(4)
self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
self.assertRaises(ValueError, lambda: psser.cat.add_categories([5, 5]))
@@ -120,15 +114,6 @@ class CategoricalTest(ComparisonTestBase, TestUtils):
self.assert_eq(pser.cat.remove_categories(None),
psser.cat.remove_categories(None))
self.assert_eq(pser.cat.remove_categories([None]),
psser.cat.remove_categories([None]))
- pser.cat.remove_categories(2, inplace=True)
- psser.cat.remove_categories(2, inplace=True)
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=[1, 3]))
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
self.assertRaises(ValueError, lambda: psser.cat.remove_categories([4,
None]))
@@ -140,22 +125,13 @@ class CategoricalTest(ComparisonTestBase, TestUtils):
self.assert_eq(pser.cat.remove_unused_categories(),
psser.cat.remove_unused_categories())
- pser.cat.add_categories(4, inplace=True)
- pser.cat.remove_categories(2, inplace=True)
- psser.cat.add_categories(4, inplace=True)
- psser.cat.remove_categories(2, inplace=True)
+ pser = pser.cat.add_categories(4)
+ pser = pser.cat.remove_categories(2)
+ psser = psser.cat.add_categories(4)
+ psser = psser.cat.remove_categories(2)
self.assert_eq(pser.cat.remove_unused_categories(),
psser.cat.remove_unused_categories())
- pser.cat.remove_unused_categories(inplace=True)
- psser.cat.remove_unused_categories(inplace=True)
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=[1, 3]))
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
def test_reorder_categories(self):
pdf, psdf = self.df_pair
@@ -177,21 +153,6 @@ class CategoricalTest(ComparisonTestBase, TestUtils):
psser.cat.reorder_categories([3, 2, 1], ordered=True),
)
- pser.cat.reorder_categories([1, 2, 3], inplace=True)
- psser.cat.reorder_categories([1, 2, 3], inplace=True)
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
- pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
- psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=[3, 2, 1],
ordered=True))
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1,
2]))
self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1,
2, 4]))
self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1,
2, 2]))
@@ -671,35 +632,6 @@ class CategoricalTest(ComparisonTestBase, TestUtils):
psser.cat.rename_categories(lambda x: x.upper()),
)
- pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
- psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=["C", "b", "d",
"A"]))
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
- pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
- psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=["C", "B", "D",
"A"]))
- pdf.b = pser
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
- pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
- psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=[0, 1, 3, 2]))
- pdf.b = pser
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
self.assertRaisesRegex(
ValueError,
"new categories need to have the same number of items as the old
categories",
@@ -763,27 +695,6 @@ class CategoricalTest(ComparisonTestBase, TestUtils):
psser.cat.set_categories(["a", "c", "b", "d", "e"], ordered=True),
)
- self.assert_eq(
- pser.cat.set_categories(["a", "c", "b", "o"], inplace=True,
rename=True),
- psser.cat.set_categories(["a", "c", "b", "o"], inplace=True,
rename=True),
- )
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=["a", "c", "b",
"o"]))
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
- pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
- psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
- if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
- # Bug in pandas 1.3. dtype is not updated properly with `inplace`
argument.
- pser = pser.astype(CategoricalDtype(categories=[2, 3, 1, 0]))
- pdf.b = pser
-
- self.assert_eq(pser, psser)
- self.assert_eq(pdf, psdf)
-
self.assertRaisesRegex(
TypeError,
"Parameter 'new_categories' must be list-like, was",
diff --git a/python/pyspark/pandas/window.py b/python/pyspark/pandas/window.py
index 211bef3678c..316a4af92dd 100644
--- a/python/pyspark/pandas/window.py
+++ b/python/pyspark/pandas/window.py
@@ -1440,7 +1440,7 @@ class Expanding(ExpandingLike[FrameLike]):
return partial(property_or_func, self)
raise AttributeError(item)
- # TODO: when add 'center' and 'axis' parameter, should add to here too.
+ # TODO: when add 'axis' parameter, should add to here too.
def __repr__(self) -> str:
return "Expanding [min_periods={}]".format(self._min_periods)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]