This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8fb1d2da1059 [SPARK-55408][PS] Handle unexpected keyword argument errors related to datetime with pandas 3
8fb1d2da1059 is described below
commit 8fb1d2da10591176fabb5a216ade876348216729
Author: Takuya Ueshin <[email protected]>
AuthorDate: Mon Feb 9 08:22:42 2026 +0800
[SPARK-55408][PS] Handle unexpected keyword argument errors related to datetime with pandas 3
### What changes were proposed in this pull request?
Handles unexpected keyword argument errors related to datetime with pandas 3.
### Why are the changes needed?
pandas 3 removed the following arguments from datetime-related functions:
- the constructor of `DatetimeIndex`: `normalize`, `closed`
- `to_datetime`: `infer_datetime_format`
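For illustration, a minimal sketch of the version-gated forwarding pattern this patch applies; the `_to_datetime_kwargs` helper is hypothetical, while the version check and error message follow the patch:

```python
import pandas as pd

from pyspark._globals import _NoValue
from pyspark.loose_version import LooseVersion


def _to_datetime_kwargs(infer_datetime_format=_NoValue):
    # Hypothetical helper mirroring the patch: build the kwargs that are
    # safe to forward to pd.to_datetime for the installed pandas version.
    kwargs = {}
    if LooseVersion(pd.__version__) < "3.0.0":
        # pandas < 3 still accepts the keyword; default it to False as before.
        kwargs["infer_datetime_format"] = (
            infer_datetime_format if infer_datetime_format is not _NoValue else False
        )
    elif infer_datetime_format is not _NoValue:
        # pandas 3 removed the keyword, so fail up front with the same error
        # type pandas itself raises for an unexpected keyword argument.
        raise TypeError(
            "The 'infer_datetime_format' keyword is not supported in pandas 3.0.0 and later."
        )
    return kwargs
```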
### Does this PR introduce _any_ user-facing change?
Yes, it now behaves like pandas 3: with pandas 3 installed, passing the removed keywords raises `TypeError`.
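Roughly, assuming a working pandas-on-Spark session, the user-visible difference looks like this sketch (not captured output):

```python
import pyspark.pandas as ps

# With pandas < 3, the deprecated keyword is still forwarded and warns:
ps.DatetimeIndex(["2026-02-09"], normalize=False)  # FutureWarning

# With pandas >= 3, the same call now raises a clear error instead of
# failing deep inside pandas with an unexpected-keyword error:
# TypeError: The 'normalize' keyword is not supported in pandas 3.0.0 and later.
```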
### How was this patch tested?
Updated a related test; the other existing tests should pass.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54191 from ueshin/issues/SPARK-55408/kwargs.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/indexes/datetimes.py | 60 +++++++++++++---------
python/pyspark/pandas/indexes/timedelta.py | 4 +-
python/pyspark/pandas/namespace.py | 37 +++++++------
.../pyspark/pandas/tests/series/test_conversion.py | 11 ++--
4 files changed, 65 insertions(+), 47 deletions(-)
diff --git a/python/pyspark/pandas/indexes/datetimes.py b/python/pyspark/pandas/indexes/datetimes.py
index 772cb585f242..cd90e49dc7ee 100644
--- a/python/pyspark/pandas/indexes/datetimes.py
+++ b/python/pyspark/pandas/indexes/datetimes.py
@@ -24,6 +24,7 @@ from pandas.api.types import is_hashable
from pandas.tseries.offsets import DateOffset
from pyspark._globals import _NoValue
+from pyspark.loose_version import LooseVersion
from pyspark import pandas as ps
from pyspark.pandas import DataFrame
from pyspark.pandas.indexes.base import Index
@@ -109,8 +110,8 @@ class DatetimeIndex(Index):
cls,
data=None,
freq=_NoValue,
- normalize=False,
- closed=None,
+ normalize=_NoValue,
+ closed=_NoValue,
ambiguous="raise",
dayfirst=False,
yearfirst=False,
@@ -118,30 +119,8 @@ class DatetimeIndex(Index):
copy=False,
name=None,
) -> "DatetimeIndex":
- if closed is not None:
- warnings.warn(
- "The 'closed' keyword in DatetimeIndex construction is
deprecated "
- "and will be removed in a future version.",
- FutureWarning,
- )
- if normalize is not None:
- warnings.warn(
- "The 'normalize' keyword in DatetimeIndex construction is
deprecated "
- "and will be removed in a future version.",
- FutureWarning,
- )
- if not is_hashable(name):
- raise TypeError("Index.name must be a hashable type")
-
- if isinstance(data, (Series, Index)):
- if dtype is None:
- dtype = "datetime64[ns]"
- return cast(DatetimeIndex, Index(data, dtype=dtype, copy=copy, name=name))
-
kwargs = dict(
data=data,
- normalize=normalize,
- closed=closed,
ambiguous=ambiguous,
dayfirst=dayfirst,
yearfirst=yearfirst,
@@ -152,6 +131,39 @@ class DatetimeIndex(Index):
if freq is not _NoValue:
kwargs["freq"] = freq
+ if LooseVersion(pd.__version__) < "3.0.0":
+ if normalize is not _NoValue:
+ warnings.warn(
+ "The 'normalize' keyword in DatetimeIndex construction is
deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
+ kwargs["normalize"] = normalize
+ else:
+ kwargs["normalize"] = False
+ if closed is not _NoValue:
+ warnings.warn(
+ "The 'closed' keyword in DatetimeIndex construction is
deprecated "
+ "and will be removed in a future version.",
+ FutureWarning,
+ )
+ kwargs["closed"] = closed
+ else:
+ if normalize is not _NoValue:
+ raise TypeError(
+ "The 'normalize' keyword is not supported in pandas 3.0.0
and later."
+ )
+ if closed is not _NoValue:
+ raise TypeError("The 'closed' keyword is not supported in
pandas 3.0.0 and later.")
+
+ if not is_hashable(name):
+ raise TypeError("Index.name must be a hashable type")
+
+ if isinstance(data, (Series, Index)):
+ if dtype is None:
+ dtype = "datetime64[ns]"
+ return cast(DatetimeIndex, Index(data, dtype=dtype, copy=copy, name=name))
+
return cast(DatetimeIndex, ps.from_pandas(pd.DatetimeIndex(**kwargs)))
def __getattr__(self, item: str) -> Any:
diff --git a/python/pyspark/pandas/indexes/timedelta.py b/python/pyspark/pandas/indexes/timedelta.py
index 2138226d480f..112d2bda0688 100644
--- a/python/pyspark/pandas/indexes/timedelta.py
+++ b/python/pyspark/pandas/indexes/timedelta.py
@@ -133,10 +133,10 @@ class TimedeltaIndex(Index):
kwargs["closed"] = closed
else:
if unit is not _NoValue:
- raise ValueError("The 'unit' keyword is not supported in
pandas 3.0.0 and later.")
+ raise TypeError("The 'unit' keyword is not supported in pandas
3.0.0 and later.")
if closed is not _NoValue:
- raise ValueError("The 'closed' keyword is not supported in
pandas 3.0.0 and later.")
+ raise TypeError("The 'closed' keyword is not supported in
pandas 3.0.0 and later.")
return cast(TimedeltaIndex, ps.from_pandas(pd.TimedeltaIndex(**kwargs)))
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index fd87fddb2678..bd45c008a04d 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -1607,7 +1607,7 @@ def to_datetime(
errors: str = "raise",
format: Optional[str] = None,
unit: Optional[str] = None,
- infer_datetime_format: bool = False,
+ infer_datetime_format: Union[bool, _NoValueType] = _NoValue,
origin: str = "unix",
):
"""
@@ -1747,19 +1747,29 @@ def to_datetime(
"microseconds": "us",
}
+ kwargs = dict(
+ errors=errors,
+ format=format,
+ unit=unit,
+ origin=origin,
+ )
+
+ if LooseVersion(pd.__version__) < "3.0.0":
+ kwargs["infer_datetime_format"] = (
+ infer_datetime_format if infer_datetime_format is not _NoValue else False
+ )
+ else:
+ if infer_datetime_format is not _NoValue:
+ raise TypeError(
+ "The 'infer_datetime_format' keyword is not supported in
pandas 3.0.0 and later."
+ )
+
def pandas_to_datetime(
pser_or_pdf: Union[pd.DataFrame, pd.Series], cols: Optional[List[str]] = None
) -> Series[np.datetime64]:
if isinstance(pser_or_pdf, pd.DataFrame):
pser_or_pdf = pser_or_pdf[cols]
- return pd.to_datetime(
- pser_or_pdf,
- errors=errors,
- format=format,
- unit=unit,
- infer_datetime_format=infer_datetime_format,
- origin=origin,
- )
+ return pd.to_datetime(pser_or_pdf, **kwargs)
if isinstance(arg, Series):
return arg.pandas_on_spark.transform_batch(pandas_to_datetime)
@@ -1774,14 +1784,7 @@ def to_datetime(
psdf = arg[list_cols]
return psdf.pandas_on_spark.transform_batch(pandas_to_datetime, list_cols)
- return pd.to_datetime(
- arg,
- errors=errors,
- format=format,
- unit=unit,
- infer_datetime_format=infer_datetime_format,
- origin=origin,
- )
+ return pd.to_datetime(arg, **kwargs)
def date_range(
diff --git a/python/pyspark/pandas/tests/series/test_conversion.py b/python/pyspark/pandas/tests/series/test_conversion.py
index fa7ddba913e8..2fa038c48d3f 100644
--- a/python/pyspark/pandas/tests/series/test_conversion.py
+++ b/python/pyspark/pandas/tests/series/test_conversion.py
@@ -18,6 +18,7 @@ import unittest
import pandas as pd
+from pyspark.loose_version import LooseVersion
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -43,10 +44,12 @@ class SeriesConversionMixin:
pser = pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"] * 100)
psser = ps.from_pandas(pser)
- self.assert_eq(
- pd.to_datetime(pser, infer_datetime_format=True),
- ps.to_datetime(psser, infer_datetime_format=True),
- )
+ self.assert_eq(pd.to_datetime(pser), ps.to_datetime(psser))
+ if LooseVersion(pd.__version__) < "3.0.0":
+ self.assert_eq(
+ pd.to_datetime(pser, infer_datetime_format=True),
+ ps.to_datetime(psser, infer_datetime_format=True),
+ )
def test_to_list(self):
self.assert_eq(self.psser.tolist(), self.pser.tolist())