This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 2c9b25a90b5 [SPARK-43245][SPARK-43705][PS] Type match for `DatetimeIndex`/`TimedeltaIndex` with pandas 2 2c9b25a90b5 is described below commit 2c9b25a90b57ca2095881f9de4f13bf820f9dac9 Author: itholic <haejoon....@databricks.com> AuthorDate: Thu Aug 10 17:42:21 2023 +0900 [SPARK-43245][SPARK-43705][PS] Type match for `DatetimeIndex`/`TimedeltaIndex` with pandas 2 ### What changes were proposed in this pull request? This PR proposes to match the type for `DatetimeIndex`/`TimedeltaIndex` with [pandas 2](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html). ### Why are the changes needed? To match the behavior with pandas 2 and above. ### Does this PR introduce _any_ user-facing change? Yes, the return type for several DatetimeIndex & TimedeltaIndex APIs is changed to `int32` instead of `int64`. e.g. ```diff >>> s = ps.from_pandas(pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series()) >>> s.dt.dayofweek 2016-12-31 5 2017-01-01 6 2017-01-02 0 2017-01-03 1 2017-01-04 2 2017-01-05 3 2017-01-06 4 2017-01-07 5 2017-01-08 6 - dtype: int64 + dtype: int32 ``` ### How was this patch tested? Enabling the existing doctests & UTs. Closes #42271 from itholic/pandas_datetime_api. 
Authored-by: itholic <haejoon....@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .../source/migration_guide/pyspark_upgrade.rst | 1 + python/pyspark/pandas/datetimes.py | 30 +++++++++++----------- python/pyspark/pandas/indexes/timedelta.py | 3 ++- .../pyspark/pandas/tests/indexes/test_datetime.py | 28 +++++++++++++++----- .../pyspark/pandas/tests/indexes/test_timedelta.py | 4 --- 5 files changed, 40 insertions(+), 26 deletions(-) diff --git a/python/docs/source/migration_guide/pyspark_upgrade.rst b/python/docs/source/migration_guide/pyspark_upgrade.rst index da49719579a..98630133e0c 100644 --- a/python/docs/source/migration_guide/pyspark_upgrade.rst +++ b/python/docs/source/migration_guide/pyspark_upgrade.rst @@ -34,6 +34,7 @@ Upgrading from PySpark 3.5 to 4.0 * In Spark 4.0, ``closed`` parameter from ``ps.date_range`` has been removed from pandas API on Spark. * In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``DataFrame.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead. * In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``Series.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead. +* In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``, ``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas API on Spark. 
Upgrading from PySpark 3.3 to 3.4 diff --git a/python/pyspark/pandas/datetimes.py b/python/pyspark/pandas/datetimes.py index 752f6f46282..b0649cf5761 100644 --- a/python/pyspark/pandas/datetimes.py +++ b/python/pyspark/pandas/datetimes.py @@ -27,7 +27,7 @@ from pandas.tseries.offsets import DateOffset import pyspark.pandas as ps import pyspark.sql.functions as F -from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, LongType +from pyspark.sql.types import DateType, TimestampType, TimestampNTZType, LongType, IntegerType class DatetimeMethods: @@ -64,42 +64,42 @@ class DatetimeMethods: """ The year of the datetime. """ - return self._data.spark.transform(lambda c: F.year(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.year(c).cast(IntegerType())) @property def month(self) -> "ps.Series": """ The month of the timestamp as January = 1 December = 12. """ - return self._data.spark.transform(lambda c: F.month(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.month(c).cast(IntegerType())) @property def day(self) -> "ps.Series": """ The days of the datetime. """ - return self._data.spark.transform(lambda c: F.dayofmonth(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.dayofmonth(c).cast(IntegerType())) @property def hour(self) -> "ps.Series": """ The hours of the datetime. """ - return self._data.spark.transform(lambda c: F.hour(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.hour(c).cast(IntegerType())) @property def minute(self) -> "ps.Series": """ The minutes of the datetime. """ - return self._data.spark.transform(lambda c: F.minute(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.minute(c).cast(IntegerType())) @property def second(self) -> "ps.Series": """ The seconds of the datetime. 
""" - return self._data.spark.transform(lambda c: F.second(c).cast(LongType())) + return self._data.spark.transform(lambda c: F.second(c).cast(IntegerType())) @property def microsecond(self) -> "ps.Series": @@ -107,7 +107,7 @@ class DatetimeMethods: The microseconds of the datetime. """ - def pandas_microsecond(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_microsecond(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.microsecond return self._data.pandas_on_spark.transform_batch(pandas_microsecond) @@ -171,10 +171,10 @@ class DatetimeMethods: 2017-01-06 4 2017-01-07 5 2017-01-08 6 - dtype: int64 + dtype: int32 """ - def pandas_dayofweek(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_dayofweek(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.dayofweek return self._data.pandas_on_spark.transform_batch(pandas_dayofweek) @@ -191,7 +191,7 @@ class DatetimeMethods: The ordinal day of the year. """ - def pandas_dayofyear(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_dayofyear(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.dayofyear return self._data.pandas_on_spark.transform_batch(pandas_dayofyear) @@ -202,7 +202,7 @@ class DatetimeMethods: The quarter of the date. """ - def pandas_quarter(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_quarter(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.quarter return self._data.pandas_on_spark.transform_batch(pandas_quarter) @@ -320,7 +320,7 @@ class DatetimeMethods: 1 1 2 2 3 2 - Name: dates, dtype: int64 + Name: dates, dtype: int32 >>> df.dates.dt.is_quarter_start 0 False @@ -370,7 +370,7 @@ class DatetimeMethods: 1 1 2 2 3 2 - Name: dates, dtype: int64 + Name: dates, dtype: int32 >>> df.dates.dt.is_quarter_start 0 False @@ -508,7 +508,7 @@ class DatetimeMethods: The number of days in the month. 
""" - def pandas_daysinmonth(s) -> ps.Series[np.int64]: # type: ignore[no-untyped-def] + def pandas_daysinmonth(s) -> ps.Series[np.int32]: # type: ignore[no-untyped-def] return s.dt.daysinmonth return self._data.pandas_on_spark.transform_batch(pandas_daysinmonth) diff --git a/python/pyspark/pandas/indexes/timedelta.py b/python/pyspark/pandas/indexes/timedelta.py index 36cbb7cf4e6..b99c78542e7 100644 --- a/python/pyspark/pandas/indexes/timedelta.py +++ b/python/pyspark/pandas/indexes/timedelta.py @@ -19,6 +19,7 @@ from functools import partial import pandas as pd from pandas.api.types import is_hashable # type: ignore[attr-defined] +import numpy as np from pyspark import pandas as ps from pyspark._globals import _NoValue @@ -136,7 +137,7 @@ class TimedeltaIndex(Index): Number of days for each element. """ - def pandas_days(x) -> int: # type: ignore[no-untyped-def] + def pandas_days(x) -> np.int64: # type: ignore[no-untyped-def] return x.days return Index(self.to_series().transform(pandas_days)) diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index 4fb3561de6a..f289ec23fc4 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -73,10 +73,6 @@ class DatetimeIndexTestsMixin: ): ps.DatetimeIndex(["2004-01-01", "2002-12-31", "2000-04-01"]).all() - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43608): Enable DatetimeIndexTests.test_properties for pandas 2.0.0.", - ) def test_properties(self): for psidx, pidx in self.idx_pairs: self.assert_eq(psidx.year, pidx.year) @@ -86,8 +82,6 @@ class DatetimeIndexTestsMixin: self.assert_eq(psidx.minute, pidx.minute) self.assert_eq(psidx.second, pidx.second) self.assert_eq(psidx.microsecond, pidx.microsecond) - self.assert_eq(psidx.week, pidx.week) - self.assert_eq(psidx.weekofyear, pidx.weekofyear) self.assert_eq(psidx.dayofweek, pidx.dayofweek) 
self.assert_eq(psidx.weekday, pidx.weekday) self.assert_eq(psidx.dayofyear, pidx.dayofyear) @@ -106,6 +100,28 @@ class DatetimeIndexTestsMixin: self.assert_eq(psidx.day_of_year, pidx.day_of_year) self.assert_eq(psidx.day_of_week, pidx.day_of_week) + if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): + # TODO(SPARK-42617): Support isocalendar.week and replace it. + expected_results = [ + ps.Index([1]), + ps.Index([1, 1, 13]), + ps.Index([52, 52, 1]), + ps.Index([52, 52, 52]), + ps.Index([52, 52, 52]), + ps.Index([52, 52, 52]), + ps.Index([52, 52, 52]), + ps.Index([52, 52, 52]), + ps.Index([52, 1, 2]), + ps.Index([13, 26, 39]), + ] + for psidx, expected_result in zip(self.psidxs, expected_results): + self.assert_eq(psidx.week, expected_result) + self.assert_eq(psidx.weekofyear, expected_result) + else: + for psidx, pidx in self.idx_pairs: + self.assert_eq(psidx.week, pidx.week) + self.assert_eq(psidx.weekofyear, pidx.weekofyear) + def test_ceil(self): for psidx, pidx in self.idx_pairs: for freq in self.fixed_freqs: diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py b/python/pyspark/pandas/tests/indexes/test_timedelta.py index a9bb93e65bd..5321f96eeab 100644 --- a/python/pyspark/pandas/tests/indexes/test_timedelta.py +++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py @@ -98,10 +98,6 @@ class TimedeltaIndexTestsMixin: ): psidx.all() - @unittest.skipIf( - LooseVersion(pd.__version__) >= LooseVersion("2.0.0"), - "TODO(SPARK-43705): Enable TimedeltaIndexTests.test_properties for pandas 2.0.0.", - ) def test_properties(self): self.assert_eq(self.psidx.days, self.pidx.days) self.assert_eq(self.psidx.seconds, self.pidx.seconds) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org