This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 8fb1d2da1059 [SPARK-55408][PS] Handle unexpected keyword argument errors related to datetime with pandas 3
8fb1d2da1059 is described below

commit 8fb1d2da10591176fabb5a216ade876348216729
Author: Takuya Ueshin <[email protected]>
AuthorDate: Mon Feb 9 08:22:42 2026 +0800

    [SPARK-55408][PS] Handle unexpected keyword argument errors related to datetime with pandas 3
    
    ### What changes were proposed in this pull request?
    
    Handles unexpected keyword argument errors related to datetime with pandas 3.
    
    ### Why are the changes needed?
    
    Several arguments were removed from datetime-related functions in pandas 3 (illustrated right after this list):
    
    - the constructor of `DatetimeIndex`: `normalize`, `closed`
    - `to_datetime`: `infer_datetime_format`
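
    For illustration, under pandas >= 3 the removed keywords now fail fast in
    pandas-on-Spark instead of being forwarded to pandas (a sketch of the
    behavior added by this patch; the sample inputs are made up):

        import pyspark.pandas as ps

        ps.DatetimeIndex(["2026-02-09"], normalize=True)
        # TypeError: The 'normalize' keyword is not supported in pandas 3.0.0 and later.

        ps.to_datetime(["3/11/2000"], infer_datetime_format=True)
        # TypeError: The 'infer_datetime_format' keyword is not supported in pandas 3.0.0 and later.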
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. Passing the removed keywords now raises a `TypeError` under pandas >= 3, matching pandas 3, while pandas < 3 keeps the existing deprecation warnings, as shown below.
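
    For example, under pandas < 3 the same call still succeeds but emits a
    deprecation warning (illustrative; the warning text comes from the
    pre-existing pandas < 3 code path):

        import pyspark.pandas as ps

        idx = ps.DatetimeIndex(["2026-02-09"], normalize=True)
        # FutureWarning: The 'normalize' keyword in DatetimeIndex construction
        # is deprecated and will be removed in a future version.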
    
    ### How was this patch tested?
    
    Updated a related test; the other existing tests should pass.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #54191 from ueshin/issues/SPARK-55408/kwargs.
    
    Authored-by: Takuya Ueshin <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 python/pyspark/pandas/indexes/datetimes.py         | 60 +++++++++++++---------
 python/pyspark/pandas/indexes/timedelta.py         |  4 +-
 python/pyspark/pandas/namespace.py                 | 37 +++++++------
 .../pyspark/pandas/tests/series/test_conversion.py | 11 ++--
 4 files changed, 65 insertions(+), 47 deletions(-)

diff --git a/python/pyspark/pandas/indexes/datetimes.py b/python/pyspark/pandas/indexes/datetimes.py
index 772cb585f242..cd90e49dc7ee 100644
--- a/python/pyspark/pandas/indexes/datetimes.py
+++ b/python/pyspark/pandas/indexes/datetimes.py
@@ -24,6 +24,7 @@ from pandas.api.types import is_hashable
 from pandas.tseries.offsets import DateOffset
 from pyspark._globals import _NoValue
 
+from pyspark.loose_version import LooseVersion
 from pyspark import pandas as ps
 from pyspark.pandas import DataFrame
 from pyspark.pandas.indexes.base import Index
@@ -109,8 +110,8 @@ class DatetimeIndex(Index):
         cls,
         data=None,
         freq=_NoValue,
-        normalize=False,
-        closed=None,
+        normalize=_NoValue,
+        closed=_NoValue,
         ambiguous="raise",
         dayfirst=False,
         yearfirst=False,
@@ -118,30 +119,8 @@ class DatetimeIndex(Index):
         copy=False,
         name=None,
     ) -> "DatetimeIndex":
-        if closed is not None:
-            warnings.warn(
-                "The 'closed' keyword in DatetimeIndex construction is deprecated "
-                "and will be removed in a future version.",
-                FutureWarning,
-            )
-        if normalize is not None:
-            warnings.warn(
-                "The 'normalize' keyword in DatetimeIndex construction is deprecated "
-                "and will be removed in a future version.",
-                FutureWarning,
-            )
-        if not is_hashable(name):
-            raise TypeError("Index.name must be a hashable type")
-
-        if isinstance(data, (Series, Index)):
-            if dtype is None:
-                dtype = "datetime64[ns]"
-            return cast(DatetimeIndex, Index(data, dtype=dtype, copy=copy, name=name))
-
         kwargs = dict(
             data=data,
-            normalize=normalize,
-            closed=closed,
             ambiguous=ambiguous,
             dayfirst=dayfirst,
             yearfirst=yearfirst,
@@ -152,6 +131,39 @@ class DatetimeIndex(Index):
         if freq is not _NoValue:
             kwargs["freq"] = freq
 
+        if LooseVersion(pd.__version__) < "3.0.0":
+            if normalize is not _NoValue:
+                warnings.warn(
+                    "The 'normalize' keyword in DatetimeIndex construction is deprecated "
+                    "and will be removed in a future version.",
+                    FutureWarning,
+                )
+                kwargs["normalize"] = normalize
+            else:
+                kwargs["normalize"] = False
+            if closed is not _NoValue:
+                warnings.warn(
+                    "The 'closed' keyword in DatetimeIndex construction is deprecated "
+                    "and will be removed in a future version.",
+                    FutureWarning,
+                )
+                kwargs["closed"] = closed
+        else:
+            if normalize is not _NoValue:
+                raise TypeError(
+                    "The 'normalize' keyword is not supported in pandas 3.0.0 and later."
+                )
+            if closed is not _NoValue:
+                raise TypeError("The 'closed' keyword is not supported in pandas 3.0.0 and later.")
+
+        if not is_hashable(name):
+            raise TypeError("Index.name must be a hashable type")
+
+        if isinstance(data, (Series, Index)):
+            if dtype is None:
+                dtype = "datetime64[ns]"
+            return cast(DatetimeIndex, Index(data, dtype=dtype, copy=copy, name=name))
+
         return cast(DatetimeIndex, ps.from_pandas(pd.DatetimeIndex(**kwargs)))
 
     def __getattr__(self, item: str) -> Any:
diff --git a/python/pyspark/pandas/indexes/timedelta.py b/python/pyspark/pandas/indexes/timedelta.py
index 2138226d480f..112d2bda0688 100644
--- a/python/pyspark/pandas/indexes/timedelta.py
+++ b/python/pyspark/pandas/indexes/timedelta.py
@@ -133,10 +133,10 @@ class TimedeltaIndex(Index):
                 kwargs["closed"] = closed
         else:
             if unit is not _NoValue:
-                raise ValueError("The 'unit' keyword is not supported in pandas 3.0.0 and later.")
+                raise TypeError("The 'unit' keyword is not supported in pandas 3.0.0 and later.")
 
             if closed is not _NoValue:
-                raise ValueError("The 'closed' keyword is not supported in pandas 3.0.0 and later.")
+                raise TypeError("The 'closed' keyword is not supported in pandas 3.0.0 and later.")
 
         return cast(TimedeltaIndex, ps.from_pandas(pd.TimedeltaIndex(**kwargs)))
 
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index fd87fddb2678..bd45c008a04d 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -1607,7 +1607,7 @@ def to_datetime(
     errors: str = "raise",
     format: Optional[str] = None,
     unit: Optional[str] = None,
-    infer_datetime_format: bool = False,
+    infer_datetime_format: Union[bool, _NoValueType] = _NoValue,
     origin: str = "unix",
 ):
     """
@@ -1747,19 +1747,29 @@ def to_datetime(
         "microseconds": "us",
     }
 
+    kwargs = dict(
+        errors=errors,
+        format=format,
+        unit=unit,
+        origin=origin,
+    )
+
+    if LooseVersion(pd.__version__) < "3.0.0":
+        kwargs["infer_datetime_format"] = (
+            infer_datetime_format if infer_datetime_format is not _NoValue else False
+        )
+    else:
+        if infer_datetime_format is not _NoValue:
+            raise TypeError(
+                "The 'infer_datetime_format' keyword is not supported in pandas 3.0.0 and later."
+            )
+
     def pandas_to_datetime(
         pser_or_pdf: Union[pd.DataFrame, pd.Series], cols: Optional[List[str]] = None
     ) -> Series[np.datetime64]:
         if isinstance(pser_or_pdf, pd.DataFrame):
             pser_or_pdf = pser_or_pdf[cols]
-        return pd.to_datetime(
-            pser_or_pdf,
-            errors=errors,
-            format=format,
-            unit=unit,
-            infer_datetime_format=infer_datetime_format,
-            origin=origin,
-        )
+        return pd.to_datetime(pser_or_pdf, **kwargs)
 
     if isinstance(arg, Series):
         return arg.pandas_on_spark.transform_batch(pandas_to_datetime)
@@ -1774,14 +1784,7 @@ def to_datetime(
 
         psdf = arg[list_cols]
         return psdf.pandas_on_spark.transform_batch(pandas_to_datetime, list_cols)
-    return pd.to_datetime(
-        arg,
-        errors=errors,
-        format=format,
-        unit=unit,
-        infer_datetime_format=infer_datetime_format,
-        origin=origin,
-    )
+    return pd.to_datetime(arg, **kwargs)
 
 
 def date_range(
diff --git a/python/pyspark/pandas/tests/series/test_conversion.py b/python/pyspark/pandas/tests/series/test_conversion.py
index fa7ddba913e8..2fa038c48d3f 100644
--- a/python/pyspark/pandas/tests/series/test_conversion.py
+++ b/python/pyspark/pandas/tests/series/test_conversion.py
@@ -18,6 +18,7 @@ import unittest
 
 import pandas as pd
 
+from pyspark.loose_version import LooseVersion
 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
@@ -43,10 +44,12 @@ class SeriesConversionMixin:
         pser = pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"] * 100)
         psser = ps.from_pandas(pser)
 
-        self.assert_eq(
-            pd.to_datetime(pser, infer_datetime_format=True),
-            ps.to_datetime(psser, infer_datetime_format=True),
-        )
+        self.assert_eq(pd.to_datetime(pser), ps.to_datetime(psser))
+        if LooseVersion(pd.__version__) < "3.0.0":
+            self.assert_eq(
+                pd.to_datetime(pser, infer_datetime_format=True),
+                ps.to_datetime(psser, infer_datetime_format=True),
+            )
 
     def test_to_list(self):
         self.assert_eq(self.psser.tolist(), self.pser.tolist())
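
The recurring pattern in this patch is a `_NoValue` sentinel plus a pandas
version gate, so version-specific keywords are only forwarded when the
installed pandas supports them. A minimal self-contained sketch of that
pattern, reusing PySpark's own `_NoValue` and `LooseVersion` helpers (the
function name `gated_to_datetime` is illustrative, not part of the patch):

    import pandas as pd

    from pyspark._globals import _NoValue
    from pyspark.loose_version import LooseVersion


    def gated_to_datetime(arg, infer_datetime_format=_NoValue):
        # Keywords accepted by every supported pandas version.
        kwargs = {"errors": "raise", "origin": "unix"}
        if LooseVersion(pd.__version__) < "3.0.0":
            # pandas < 3 still accepts the keyword; keep the old default of False.
            kwargs["infer_datetime_format"] = (
                infer_datetime_format if infer_datetime_format is not _NoValue else False
            )
        elif infer_datetime_format is not _NoValue:
            # pandas >= 3 removed the keyword; reject it explicitly rather than
            # letting pandas raise an opaque "unexpected keyword argument" error.
            raise TypeError(
                "The 'infer_datetime_format' keyword is not supported in pandas 3.0.0 and later."
            )
        return pd.to_datetime(arg, **kwargs)

Called without the gated keyword, `gated_to_datetime` behaves identically
across pandas versions; only an explicit `infer_datetime_format=` argument
diverges, which is exactly the contract the patch gives `ps.to_datetime`.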

