This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c75186c [SPARK-37563][PYTHON] Implement days, seconds, microseconds properties of TimedeltaIndex
c75186c is described below
commit c75186cd111b91d13e32159169334d562bdeb767
Author: Xinrong Meng <[email protected]>
AuthorDate: Wed Dec 15 11:47:42 2021 +0900
[SPARK-37563][PYTHON] Implement days, seconds, microseconds properties of TimedeltaIndex
### What changes were proposed in this pull request?
Implement days, seconds, microseconds properties of TimedeltaIndex
### Why are the changes needed?
To be consistent with pandas.
### Does this PR introduce _any_ user-facing change?
Yes.
```py
# Positive timedelta
>>> psidx = ps.TimedeltaIndex(
... [
... timedelta(days=1),
... timedelta(seconds=1),
... timedelta(microseconds=1),
... timedelta(milliseconds=1),
... timedelta(minutes=1),
... timedelta(hours=1),
... timedelta(weeks=1),
... ],
... name="x",
... )
>>> psidx.days
Int64Index([1, 0, 0, 0, 0, 0, 7], dtype='int64', name='x')
>>> psidx.seconds
Int64Index([0, 1, 0, 0, 60, 3600, 0], dtype='int64', name='x')
>>> psidx.microseconds
Int64Index([0, 0, 1, 1000, 0, 0, 0], dtype='int64', name='x')
# Negative timedelta
>>> psidx = ps.TimedeltaIndex(
... [
... timedelta(days=-1),
... timedelta(seconds=-1),
... timedelta(microseconds=-1),
... timedelta(milliseconds=-1),
... timedelta(minutes=-1),
... timedelta(hours=-1),
... timedelta(weeks=-1),
... ],
... name="x",
... )
>>> psidx.days
Int64Index([-1, -1, -1, -1, -1, -1, -7], dtype='int64', name='x')
>>> psidx.seconds
Int64Index([0, 86399, 86399, 86399, 86340, 82800, 0], dtype='int64', name='x')
>>> psidx.microseconds
Int64Index([0, 0, 999999, 999000, 0, 0, 0], dtype='int64', name='x')
```
### How was this patch tested?
Unit tests.
Closes #34825 from xinrong-databricks/timedeltaProperties.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/sparktestsupport/modules.py | 2 +
.../source/reference/pyspark.pandas/indexing.rst | 23 ++++--
python/pyspark/pandas/indexes/timedelta.py | 76 ++++++++++++++++++++
python/pyspark/pandas/missing/indexes.py | 3 -
python/pyspark/pandas/spark/functions.py | 11 +++
.../pyspark/pandas/tests/indexes/test_timedelta.py | 84 ++++++++++++++++++++++
6 files changed, 189 insertions(+), 10 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 5dd3ab6..297d2ea 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -614,6 +614,7 @@ pyspark_pandas = Module(
"pyspark.pandas.indexes.base",
"pyspark.pandas.indexes.category",
"pyspark.pandas.indexes.datetimes",
+ "pyspark.pandas.indexes.timedelta",
"pyspark.pandas.indexes.multi",
"pyspark.pandas.indexes.numeric",
"pyspark.pandas.spark.accessors",
@@ -632,6 +633,7 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.data_type_ops.test_string_ops",
"pyspark.pandas.tests.data_type_ops.test_udt_ops",
"pyspark.pandas.tests.indexes.test_category",
+ "pyspark.pandas.tests.indexes.test_timedelta",
"pyspark.pandas.tests.plot.test_frame_plot",
"pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
"pyspark.pandas.tests.plot.test_frame_plot_plotly",
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 0c94012..15539fa 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -336,13 +336,6 @@ DatatimeIndex
DatetimeIndex
-TimedeltaIndex
---------------
-.. autosummary::
- :toctree: api/
-
- TimedeltaIndex
-
Time/date components
~~~~~~~~~~~~~~~~~~~~
.. autosummary::
@@ -393,3 +386,19 @@ Time-specific operations
DatetimeIndex.ceil
DatetimeIndex.month_name
DatetimeIndex.day_name
+
+TimedeltaIndex
+--------------
+.. autosummary::
+ :toctree: api/
+
+ TimedeltaIndex
+
+Components
+~~~~~~~~~~
+.. autosummary::
+ :toctree: api/
+
+ TimedeltaIndex.days
+ TimedeltaIndex.seconds
+ TimedeltaIndex.microseconds
diff --git a/python/pyspark/pandas/indexes/timedelta.py b/python/pyspark/pandas/indexes/timedelta.py
index f5e94d7..811c9d6 100644
--- a/python/pyspark/pandas/indexes/timedelta.py
+++ b/python/pyspark/pandas/indexes/timedelta.py
@@ -25,6 +25,19 @@ from pyspark._globals import _NoValue
from pyspark.pandas.indexes.base import Index
from pyspark.pandas.missing.indexes import MissingPandasLikeTimedeltaIndex
from pyspark.pandas.series import Series
+from pyspark.pandas.spark import functions as SF
+from pyspark.sql import functions as F
+
+
+HOURS_PER_DAY = 24
+MINUTES_PER_HOUR = 60
+SECONDS_PER_MINUTE = 60
+MILLIS_PER_SECOND = 1000
+MICROS_PER_MILLIS = 1000
+
+SECONDS_PER_HOUR = MINUTES_PER_HOUR * SECONDS_PER_MINUTE
+SECONDS_PER_DAY = HOURS_PER_DAY * SECONDS_PER_HOUR
+MICROS_PER_SECOND = MILLIS_PER_SECOND * MICROS_PER_MILLIS
class TimedeltaIndex(Index):
@@ -111,3 +124,66 @@ class TimedeltaIndex(Index):
return partial(property_or_func, self)
raise AttributeError("'TimedeltaIndex' object has no attribute '{}'".format(item))
+
+ @property
+ def days(self) -> Index:
+ """
+ Number of days for each element.
+ """
+
+ @no_type_check
+ def pandas_days(x) -> int:
+ return x.days
+
+ return Index(self.to_series().transform(pandas_days))
+
+ @property
+ def seconds(self) -> Index:
+ """
+ Number of seconds (>= 0 and less than 1 day) for each element.
+ """
+
+ @no_type_check
+ def get_seconds(scol):
+ hour_scol = SF.date_part("HOUR", scol)
+ minute_scol = SF.date_part("MINUTE", scol)
+ second_scol = SF.date_part("SECOND", scol)
+ return (
+ F.when(
+ hour_scol < 0,
+ SECONDS_PER_DAY + hour_scol * SECONDS_PER_HOUR,
+ ).otherwise(hour_scol * SECONDS_PER_HOUR)
+ + F.when(
+ minute_scol < 0,
+ SECONDS_PER_DAY + minute_scol * SECONDS_PER_MINUTE,
+ ).otherwise(minute_scol * SECONDS_PER_MINUTE)
+ + F.when(
+ second_scol < 0,
+ SECONDS_PER_DAY + second_scol,
+ ).otherwise(second_scol)
+ ).cast("int")
+
+ return Index(self.to_series().spark.transform(get_seconds))
+
+ @property
+ def microseconds(self) -> Index:
+ """
+ Number of microseconds (>= 0 and less than 1 second) for each element.
+ """
+
+ @no_type_check
+ def get_microseconds(scol):
+ second_scol = SF.date_part("SECOND", scol)
+ return (
+ (
+ F.when(
+ (second_scol >= 0) & (second_scol < 1),
+ second_scol,
+ )
+ .when(second_scol < 0, 1 + second_scol)
+ .otherwise(0)
+ )
+ * MICROS_PER_SECOND
+ ).cast("int")
+
+ return Index(self.to_series().spark.transform(get_microseconds))
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index f1ddccf..73dab4a 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -103,9 +103,6 @@ class MissingPandasLikeDatetimeIndex(MissingPandasLikeIndex):
class MissingPandasLikeTimedeltaIndex(MissingPandasLikeIndex):
# Properties
- days = _unsupported_property("days", cls="TimedeltaIndex")
- seconds = _unsupported_property("seconds", cls="TimedeltaIndex")
- microseconds = _unsupported_property("microseconds", cls="TimedeltaIndex")
nanoseconds = _unsupported_property("nanoseconds", cls="TimedeltaIndex")
components = _unsupported_property("components", cls="TimedeltaIndex")
inferred_freq = _unsupported_property("inferred_freq", cls="TimedeltaIndex")
diff --git a/python/pyspark/pandas/spark/functions.py b/python/pyspark/pandas/spark/functions.py
index 73251fc..dcc8fc8 100644
--- a/python/pyspark/pandas/spark/functions.py
+++ b/python/pyspark/pandas/spark/functions.py
@@ -45,6 +45,17 @@ def repeat(col: Column, n: Union[int, Column]) -> Column:
return _call_udf(sc, "repeat", _to_java_column(col), n)
+def date_part(field: Union[str, Column], source: Column) -> Column:
+ """
+ Extracts a part of the date/timestamp or interval source.
+ """
+ sc = SparkContext._active_spark_context # type: ignore[attr-defined]
+ field = (
+ _to_java_column(field) if isinstance(field, Column) else _create_column_from_literal(field)
+ )
+ return _call_udf(sc, "date_part", field, _to_java_column(source))
+
+
def lit(literal: Any) -> Column:
"""
Creates a Column of literal value.
diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py b/python/pyspark/pandas/tests/indexes/test_timedelta.py
new file mode 100644
index 0000000..bdb98a3
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py
@@ -0,0 +1,84 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from datetime import timedelta
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class TimedeltaIndexTest(PandasOnSparkTestCase, TestUtils):
+ @property
+ def pidx(self):
+ return pd.TimedeltaIndex(
+ [
+ timedelta(days=1),
+ timedelta(seconds=1),
+ timedelta(microseconds=1),
+ timedelta(milliseconds=1),
+ timedelta(minutes=1),
+ timedelta(hours=1),
+ timedelta(weeks=1),
+ ],
+ name="x",
+ )
+
+ @property
+ def neg_pidx(self):
+ return pd.TimedeltaIndex(
+ [
+ timedelta(days=-1),
+ timedelta(seconds=-1),
+ timedelta(microseconds=-1),
+ timedelta(milliseconds=-1),
+ timedelta(minutes=-1),
+ timedelta(hours=-1),
+ timedelta(weeks=-1),
+ ],
+ name="x",
+ )
+
+ @property
+ def psidx(self):
+ return ps.from_pandas(self.pidx)
+
+ @property
+ def neg_psidx(self):
+ return ps.from_pandas(self.neg_pidx)
+
+ def test_properties(self):
+ self.assert_eq(self.psidx.days, self.pidx.days)
+ self.assert_eq(self.psidx.seconds, self.pidx.seconds)
+ self.assert_eq(self.psidx.microseconds, self.pidx.microseconds)
+ self.assert_eq(self.neg_psidx.days, self.neg_pidx.days)
+ self.assert_eq(self.neg_psidx.seconds, self.neg_pidx.seconds)
+ self.assert_eq(self.neg_psidx.microseconds, self.neg_pidx.microseconds)
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.indexes.test_timedelta import * # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]