This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new eba37d2d408b [SPARK-45718][PS] Remove remaining deprecated Pandas features from Spark 3.4.0
eba37d2d408b is described below
commit eba37d2d408ba21e849c1a945a6620b66cc299a9
Author: Haejoon Lee <[email protected]>
AuthorDate: Thu Nov 2 14:22:37 2023 -0700
[SPARK-45718][PS] Remove remaining deprecated Pandas features from Spark 3.4.0
### What changes were proposed in this pull request?
This PR proposes to remove the remaining Pandas features that were deprecated in Spark 3.4.0.
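For illustration only (not part of the patch; file paths are hypothetical), the affected call sites migrate by simply dropping the removed keywords:

```python
import pyspark.pandas as ps

# Before (keywords deprecated since Spark 3.4.0, now removed):
#   ps.read_csv("data.csv", mangle_dupe_cols=True)
#   ps.read_excel("data.xlsx", convert_float=True, mangle_dupe_cols=True)
#   psdf.to_excel("out.xlsx", encoding="utf-8", verbose=True)

# After (Spark 4.0): the same calls without the removed keywords.
psdf = ps.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
psdf.to_excel("out.xlsx")  # hypothetical path; writing requires openpyxl
```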
### Why are the changes needed?
To match the behavior of Pandas. We cleaned up most of the APIs, but some features that were deprecated in Spark 3.4.0 still need to be removed.
### Does this PR introduce _any_ user-facing change?
Yes, some parameters and APIs are removed in Spark 4.0.0, as sketched below.
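A minimal migration sketch (illustrative data, not taken from the patch; the `inferred_type` replacement follows the guidance in the removed deprecation message):

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"key": [1, 1, 2, 2], "val": [None, 1.0, None, 2.0]})

# Removed GroupBy aliases -> surviving methods with identical semantics:
psdf.groupby("key").bfill()  # replaces the removed GroupBy.backfill()
psdf.groupby("key").ffill()  # replaces the removed GroupBy.pad()

# Index.is_all_dates is gone; inferred_type covers the same check:
idx = ps.Index([0, 1, 2])
is_all_dates = idx.inferred_type == "datetime64"  # False for an integer index
```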
### How was this patch tested?
The existing CI should pass.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43581 from itholic/SPARK-45718.
Authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../source/migration_guide/pyspark_upgrade.rst | 9 +++++
.../source/reference/pyspark.pandas/groupby.rst | 1 -
.../source/reference/pyspark.pandas/indexing.rst | 2 --
python/pyspark/pandas/generic.py | 13 -------
python/pyspark/pandas/groupby.py | 28 ---------------
python/pyspark/pandas/indexes/base.py | 41 ----------------------
python/pyspark/pandas/indexes/multi.py | 22 ------------
python/pyspark/pandas/namespace.py | 29 ---------------
python/pyspark/pandas/tests/test_csv.py | 5 ---
9 files changed, 9 insertions(+), 141 deletions(-)
diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst
index 20fab5785046..06991281bf07 100644
--- a/python/docs/source/migration_guide/pyspark_upgrade.rst
+++ b/python/docs/source/migration_guide/pyspark_upgrade.rst
@@ -54,6 +54,15 @@ Upgrading from PySpark 3.5 to 4.0
* In Spark 4.0, ``DataFrame.to_spark_io`` has been removed from pandas API on Spark, use ``DataFrame.spark.to_spark_io`` instead.
* In Spark 4.0, ``Series.is_monotonic`` and ``Index.is_monotonic`` have been removed from pandas API on Spark, use ``Series.is_monotonic_increasing`` or ``Index.is_monotonic_increasing`` instead respectively.
* In Spark 4.0, ``DataFrame.get_dtype_counts`` has been removed from pandas API on Spark, use ``DataFrame.dtypes.value_counts()`` instead.
+* In Spark 4.0, ``encoding`` parameter from ``DataFrame.to_excel`` and ``Series.to_excel`` has been removed from pandas API on Spark.
+* In Spark 4.0, ``verbose`` parameter from ``DataFrame.to_excel`` and ``Series.to_excel`` has been removed from pandas API on Spark.
+* In Spark 4.0, ``mangle_dupe_cols`` parameter from ``read_csv`` has been removed from pandas API on Spark.
+* In Spark 4.0, ``DataFrameGroupBy.backfill`` has been removed from pandas API on Spark, use ``DataFrameGroupBy.bfill`` instead.
+* In Spark 4.0, ``DataFrameGroupBy.pad`` has been removed from pandas API on Spark, use ``DataFrameGroupBy.ffill`` instead.
+* In Spark 4.0, ``Index.is_all_dates`` has been removed from pandas API on Spark.
+* In Spark 4.0, ``convert_float`` parameter from ``read_excel`` has been removed from pandas API on Spark.
+* In Spark 4.0, ``mangle_dupe_cols`` parameter from ``read_excel`` has been removed from pandas API on Spark.
+
Upgrading from PySpark 3.3 to 3.4
diff --git a/python/docs/source/reference/pyspark.pandas/groupby.rst b/python/docs/source/reference/pyspark.pandas/groupby.rst
index e71e81c56dd3..7a0c771e8caa 100644
--- a/python/docs/source/reference/pyspark.pandas/groupby.rst
+++ b/python/docs/source/reference/pyspark.pandas/groupby.rst
@@ -89,7 +89,6 @@ Computations / Descriptive Stats
GroupBy.bfill
GroupBy.ffill
GroupBy.head
- GroupBy.backfill
GroupBy.shift
GroupBy.tail
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 08f5e224e06e..71584892ca38 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -43,7 +43,6 @@ Properties
Index.hasnans
Index.dtype
Index.inferred_type
- Index.is_all_dates
Index.shape
Index.name
Index.names
@@ -219,7 +218,6 @@ MultiIndex Properties
MultiIndex.has_duplicates
MultiIndex.hasnans
MultiIndex.inferred_type
- MultiIndex.is_all_dates
MultiIndex.shape
MultiIndex.names
MultiIndex.ndim
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index 16eaeb6142e5..231397628822 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -990,9 +990,7 @@ class Frame(object, metaclass=ABCMeta):
startcol: int = 0,
engine: Optional[str] = None,
merge_cells: bool = True,
- encoding: Optional[str] = None,
inf_rep: str = "inf",
- verbose: bool = True,
freeze_panes: Optional[Tuple[int, int]] = None,
) -> None:
"""
@@ -1043,20 +1041,9 @@ class Frame(object, metaclass=ABCMeta):
``io.excel.xlsm.writer``.
merge_cells: bool, default True
Write MultiIndex and Hierarchical Rows as merged cells.
- encoding: str, optional
- Encoding of the resulting excel file. Only necessary for xlwt,
- other writers support unicode natively.
-
- .. deprecated:: 3.4.0
-
inf_rep: str, default 'inf'
Representation for infinity (there is no native representation for
infinity in Excel).
- verbose: bool, default True
- Display more information in the error logs.
-
- .. deprecated:: 3.4.0
-
freeze_panes: tuple of int (length 2), optional
Specifies the one-based bottommost row and rightmost column that
is to be frozen.
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index b19a40b837a0..4cce147b2606 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -2614,20 +2614,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
"""
return self.fillna(method="bfill", limit=limit)
- def backfill(self, limit: Optional[int] = None) -> FrameLike:
- """
- Alias for bfill.
-
- .. deprecated:: 3.4.0
- """
- warnings.warn(
- "The GroupBy.backfill method is deprecated "
- "and will be removed in a future version. "
- "Use GroupBy.bfill instead.",
- FutureWarning,
- )
- return self.bfill(limit=limit)
-
def ffill(self, limit: Optional[int] = None) -> FrameLike:
"""
Synonym for `DataFrame.fillna()` with ``method=`ffill```.
@@ -2677,20 +2663,6 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
"""
return self.fillna(method="ffill", limit=limit)
- def pad(self, limit: Optional[int] = None) -> FrameLike:
- """
- Alias for ffill.
-
- .. deprecated:: 3.4.0
- """
- warnings.warn(
- "The GroupBy.pad method is deprecated "
- "and will be removed in a future version. "
- "Use GroupBy.ffill instead.",
- FutureWarning,
- )
- return self.ffill(limit=limit)
-
def _limit(self, n: int, asc: bool) -> FrameLike:
"""
Private function for tail and head.
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index 2ec0a39dc713..6c6ee9ae0d7d 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -2118,47 +2118,6 @@ class Index(IndexOpsMixin):
result.name = self.name
return result if sort is None else cast(Index, result.sort_values())
- @property
- def is_all_dates(self) -> bool:
- """
- Return if all data types of the index are datetime.
- remember that since pandas-on-Spark does not support multiple data types in an index,
- so it returns True if any type of data is datetime.
-
- .. deprecated:: 3.4.0
-
- Examples
- --------
- >>> from datetime import datetime
-
- >>> idx = ps.Index([datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 2, 3, 0, 0, 0)])
- >>> idx
- DatetimeIndex(['2019-01-01', '2019-02-03'], dtype='datetime64[ns]', freq=None)
-
- >>> idx.is_all_dates
- True
-
- >>> idx = ps.Index([datetime(2019, 1, 1, 0, 0, 0), None])
- >>> idx
- DatetimeIndex(['2019-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)
-
- >>> idx.is_all_dates
- True
-
- >>> idx = ps.Index([0, 1, 2])
- >>> idx
- Index([0, 1, 2], dtype='int64')
-
- >>> idx.is_all_dates
- False
- """
- warnings.warn(
- "Index.is_all_dates is deprecated, will be removed in a future
version. "
- "check index.inferred_type instead",
- FutureWarning,
- )
- return isinstance(self.spark.data_type, (TimestampType, TimestampNTZType))
-
def repeat(self, repeats: int) -> "Index":
"""
Repeat elements of a Index/MultiIndex.
diff --git a/python/pyspark/pandas/indexes/multi.py b/python/pyspark/pandas/indexes/multi.py
index 9917a42fb385..41c3b93ed51b 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -975,28 +975,6 @@ class MultiIndex(Index):
"only the default get_loc method is currently supported for
MultiIndex"
)
- @property
- def is_all_dates(self) -> bool:
- """
- is_all_dates always returns False for MultiIndex
-
- Examples
- --------
- >>> from datetime import datetime
-
- >>> idx = ps.MultiIndex.from_tuples(
- ... [(datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 1, 1, 0, 0, 0)),
- ... (datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 1, 1, 0, 0, 0))])
- >>> idx # doctest: +SKIP
- MultiIndex([('2019-01-01', '2019-01-01'),
- ('2019-01-01', '2019-01-01')],
- )
-
- >>> idx.is_all_dates
- False
- """
- return False
-
def __getattr__(self, item: str) -> Any:
if hasattr(MissingPandasLikeMultiIndex, item):
property_or_func = getattr(MissingPandasLikeMultiIndex, item)
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 9b64300e948f..aa9374b6dceb 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -222,7 +222,6 @@ def read_csv(
names: Optional[Union[str, List[str]]] = None,
index_col: Optional[Union[str, List[str]]] = None,
usecols: Optional[Union[List[int], List[str], Callable[[str], bool]]] = None,
- mangle_dupe_cols: bool = True,
dtype: Optional[Union[str, Dtype, Dict[str, Union[str, Dtype]]]] = None,
nrows: Optional[int] = None,
parse_dates: bool = False,
@@ -261,14 +260,6 @@ def read_csv(
from the document header row(s).
If callable, the callable function will be evaluated against the column names,
returning names where the callable function evaluates to `True`.
- mangle_dupe_cols : bool, default True
- Duplicate columns will be specified as 'X0', 'X1', ... 'XN', rather
- than 'X' ... 'X'. Passing in False will cause data to be overwritten if
- there are duplicate names in the columns.
- Currently only `True` is allowed.
-
- .. deprecated:: 3.4.0
-
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} Use str or object
together with suitable na_values settings to preserve and not interpret dtype.
@@ -310,8 +301,6 @@ def read_csv(
if "options" in options and isinstance(options.get("options"), dict) and
len(options) == 1:
options = options.get("options")
- if mangle_dupe_cols is not True:
- raise ValueError("mangle_dupe_cols can only be `True`: %s" %
mangle_dupe_cols)
if parse_dates is not False:
raise ValueError("parse_dates can only be `False`: %s" % parse_dates)
@@ -917,8 +906,6 @@ def read_excel(
thousands: Optional[str] = None,
comment: Optional[str] = None,
skipfooter: int = 0,
- convert_float: bool = True,
- mangle_dupe_cols: bool = True,
**kwds: Any,
) -> Union[DataFrame, Series, Dict[str, Union[DataFrame, Series]]]:
"""
@@ -1041,20 +1028,6 @@ def read_excel(
comment string and the end of the current line is ignored.
skipfooter : int, default 0
Rows at the end to skip (0-indexed).
- convert_float : bool, default True
- Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
- data will be read in as floats: Excel stores all numbers as floats
- internally.
-
- .. deprecated:: 3.4.0
-
- mangle_dupe_cols : bool, default True
- Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
- 'X'...'X'. Passing in False will cause data to be overwritten if there
- are duplicate names in the columns.
-
- .. deprecated:: 3.4.0
-
**kwds : optional
Optional keyword arguments can be passed to ``TextFileReader``.
@@ -1150,8 +1123,6 @@ def read_excel(
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
- convert_float=convert_float,
- mangle_dupe_cols=mangle_dupe_cols,
**kwds,
)
diff --git a/python/pyspark/pandas/tests/test_csv.py b/python/pyspark/pandas/tests/test_csv.py
index a62388050472..e35b49315712 100644
--- a/python/pyspark/pandas/tests/test_csv.py
+++ b/python/pyspark/pandas/tests/test_csv.py
@@ -254,11 +254,6 @@ class CsvTestsMixin:
actual = ps.read_csv(fn, sep="\t")
self.assert_eq(expected, actual, almost=True)
- def test_read_csv_with_mangle_dupe_cols(self):
- self.assertRaisesRegex(
- ValueError, "mangle_dupe_cols", lambda: ps.read_csv("path",
mangle_dupe_cols=False)
- )
-
def test_read_csv_with_parse_dates(self):
self.assertRaisesRegex(
ValueError, "parse_dates", lambda: ps.read_csv("path",
parse_dates=True)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]