This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 2c9b25a90b5 [SPARK-43245][SPARK-43705][PS] Type match for `DatetimeIndex`/`TimedeltaIndex` with pandas 2 2c9b25a90b5 is described below commit 2c9b25a90b57ca2095881f9de4f13bf820f9dac9 Author: itholic <haejoon....@databricks.com> AuthorDate: Thu Aug 10 17:42:21 2023 +0900 [SPARK-43245][SPARK-43705][PS] Type match for `DatetimeIndex`/`TimedeltaIndex` with pandas 2 ### What changes were proposed in this pull request? This PR proposes to match the type for `DatetimeIndex`/`TimedeltaIndex` with [pandas 2](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html). ### Why are the changes needed? To match the behavior with pandas 2 and above. ### Does this PR introduce _any_ user-facing change? Yes, the return type for several DatetimeIndex & TimedeltaIndex APIs is changed to `int32` instead of `int64`. e.g. ```diff >>> s = ps.from_pandas(pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series()) >>> s.dt.dayofweek 2016-12-31 5 2017-01-01 6 2017-01-02 0 2017-01-03 1 2017-01-04 2 2017-01-05 3 2017-01-06 4 2017-01-07 5 2017-01-08 6 - dtype: int64 + dtype: int32 ``` ### How was this patch tested? Enabling the existing doctests & UTs. Closes #42271 from itholic/pandas_datetime_api. 
Authored-by: itholic <haejoon....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../source/migration_guide/pyspark_upgrade.rst | 1 + python/pyspark/pandas/datetimes.py | 30 +++++++++++----------- python/pyspark/pandas/indexes/timedelta.py | 3 ++- .../pyspark/pandas/tests/indexes/test_datetime.py | 28 +++++++++++++++----- .../pyspark/pandas/tests/indexes/test_timedelta.py | 4 --- 5 files changed, 40 insertions(+), 26 deletions(-) diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index da49719579a..98630133e0c 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -34,6 +34,7 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, ``closed`` parameter from ``ps.date_range`` has been removed from pandas API on Spark. * In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``DataFrame.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead. * In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``Series.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead. +* In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``, ``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas API on Spark. 
Upgrading from PySpark 3.3 to 3.4 diff --git a/python/pyspark/pandas/datetimes.py b/python/pyspark/pandas/datetimes.py index 752f6f46282..b0649cf5761 100644 --- a/python/pyspark/pandas/datetimes.py +++ b/python/pyspark/pandas/datetimes.py @@ -27,7 +27,7 @@ from pandas.tseries.offsets import DateOffset import pyspark.pandas as ps import pyspark.sql.functions as F -from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, LongType +from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, LongType, IntegerType class DatetimeMethods: @@ -64,42 +64,42 @@ class DatetimeMethods: """ The year of the datetime. """ - return self._data.spark.transform(lambda c: F.year(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.year(c).cast(IntegerType())) @property def month(self) -> "ps.Series": """ The month of the timestamp as January = 1 December = 12. """ - return self._data.spark.transform(lambda c: F.month(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.month(c).cast(IntegerType())) @property def day(self) -> "ps.Series": """ The days of the datetime. """ - return self._data.spark.transform(lambda c: F.dayofmonth(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.dayofmonth(c).cast(IntegerType())) @property def hour(self) -> "ps.Series": """ The hours of the datetime. """ - return self._data.spark.transform(lambda c: F.hour(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.hour(c).cast(IntegerType())) @property def minute(self) -> "ps.Series": """ The minutes of the datetime. """ - return self._data.spark.transform(lambda c: F.minute(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.minute(c).cast(IntegerType())) @property def second(self) -> "ps.Series": """ The seconds of the datetime. 
""" - return self._data.spark.transform(lambda c: F.second(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.second(c).cast(IntegerType())) @property def microsecond(self) -> "ps.Series": @@ -107,7 +107,7 @@ class DatetimeMethods: The microseconds of the datetime. """ - def pandas_microsecond(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_microsecond(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.microsecond return self._data.pandas_on_spark.transform_batch(pandas_microsecond) @@ -171,10 +171,10 @@ class DatetimeMethods: 2017-01-06 4 2017-01-07 5 2017-01-08 6 - dtype: int64 + dtype: int32 """ - def pandas_dayofweek(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_dayofweek(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.dayofweek return self._data.pandas_on_spark.transform_batch(pandas_dayofweek) @@ -191,7 +191,7 @@ class DatetimeMethods: The ordinal day of the year. """ - def pandas_dayofyear(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_dayofyear(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.dayofyear return self._data.pandas_on_spark.transform_batch(pandas_dayofyear) @@ -202,7 +202,7 @@ class DatetimeMethods: The quarter of the date. """ - def pandas_quarter(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_quarter(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.quarter return self._data.pandas_on_spark.transform_batch(pandas_quarter) @@ -320,7 +320,7 @@ class DatetimeMethods: 1 1 2 2 3 2 - Name: dates, dtype: int64 + Name: dates, dtype: int32 >>> df.dates.dt.is_quarter_start 0 False @@ -370,7 +370,7 @@ class DatetimeMethods: 1 1 2 2 3 2 - Name: dates, dtype: int64 + Name: dates, dtype: int32 >>> df.dates.dt.is_quarter_start 0 False @@ -508,7 +508,7 @@ class DatetimeMethods: The number of days in the month. 
""" - def pandas_daysinmonth(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_daysinmonth(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.daysinmonth return self._data.pandas_on_spark.transform_batch(pandas_daysinmonth) diff --git a/python/pyspark/pandas/indexes/timedelta.py b/python/pyspark/pandas/indexes/timedelta.py index 36cbb7cf4e6..b99c78542e7 100644 --- a/python/pyspark/pandas/indexes/timedelta.py +++ b/python/pyspark/pandas/indexes/timedelta.py @@ -19,6 +19,7 @@ from functools import partial import pandas as pd from pandas.api.types import is_hashable # type: ignore[attr-defined] +import numpy as np from pyspark import pandas as ps from pyspark._globals import _NoValue @@ -136,7 +137,7 @@ class TimedeltaIndex(Index): Number of days for each element. """ - def pandas_days(x) -> int: # type: ignore[no-untyped-def] + def pandas_days(x) -> np.int64: # type: ignore[no-untyped-def] return x.days return Index(self.to_series().transform(pandas_days)) diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index 4fb3561de6a..f289ec23fc4 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -73,10 +73,6 @@ class DatetimeIndexTestsMixin: ): ps.DatetimeIndex(["2004-01-01", "2002-12-31", "2000-04-01"]).all() - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43608): Enable DatetimeIndexTests.test_properties for pandas 2.0.0.", - ) def test_properties(self): for psidx, pidx in self.idx_pairs: self.assert_eq(psidx.year, pidx.year) @@ -86,8 +82,6 @@ class DatetimeIndexTestsMixin: self.assert_eq(psidx.minute, pidx.minute) self.assert_eq(psidx.second, pidx.second) self.assert_eq(psidx.microsecond, pidx.microsecond) - self.assert_eq(psidx.week, pidx.week) - self.assert_eq(psidx.weekofyear, pidx.weekofyear) self.assert_eq(psidx.dayofweek, pidx.dayofweek) 
self.assert_eq(psidx.weekday, pidx.weekday) self.assert_eq(psidx.dayofyear, pidx.dayofyear) @@ -106,6 +100,28 @@ class DatetimeIndexTestsMixin: self.assert_eq(psidx.day_of_year, pidx.day_of_year) self.assert_eq(psidx.day_of_week, pidx.day_of_week) + if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): + # TODO(SPARK-42617): Support isocalendar.week and replace it. + expected_results = [ + ps.Index([1]), + ps.Index([1, 1, 13]), + ps.Index([52, 52, 1]), + ps.Index([52, 52, 52]), + ps.Index([52, 52, 52]), + ps.Index([52, 52, 52]), + ps.Index([52, 52, 52]), + ps.Index([52, 52, 52]), + ps.Index([52, 1, 2]), + ps.Index([13, 26, 39]), + ] + for psidx, expected_result in zip(self.psidxs, expected_results): + self.assert_eq(psidx.week, expected_result) + self.assert_eq(psidx.weekofyear, expected_result) + else: + for psidx, pidx in self.idx_pairs: + self.assert_eq(psidx.week, pidx.week) + self.assert_eq(psidx.weekofyear, pidx.weekofyear) + def test_ceil(self): for psidx, pidx in self.idx_pairs: for freq in self.fixed_freqs: diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py b/python/pyspark/pandas/tests/indexes/test_timedelta.py index a9bb93e65bd..5321f96eeab 100644 --- a/python/pyspark/pandas/tests/indexes/test_timedelta.py +++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py @@ -98,10 +98,6 @@ class TimedeltaIndexTestsMixin: ): psidx.all() - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43705): Enable TimedeltaIndexTests.test_properties for pandas 2.0.0.", - ) def test_properties(self): self.assert_eq(self.psidx.days, self.pidx.days) self.assert_eq(self.psidx.seconds, self.pidx.seconds) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org