This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c75186c [SPARK-37563][PYTHON] Implement days, seconds, microseconds properties of TimedeltaIndex
c75186c is described below
commit c75186cd111b91d13e32159169334d562bdeb767
Author: Xinrong Meng <[email protected]>
AuthorDate: Wed Dec 15 11:47:42 2021 +0900
[SPARK-37563][PYTHON] Implement days, seconds, microseconds properties of TimedeltaIndex
### What changes were proposed in this pull request?
Implement days, seconds, microseconds properties of TimedeltaIndex
### Why are the changes needed?
To be consistent with pandas.
### Does this PR introduce _any_ user-facing change?
Yes.
```py
# Positive timedelta
>>> psidx = ps.TimedeltaIndex(
... [
... timedelta(days=1),
... timedelta(seconds=1),
... timedelta(microseconds=1),
... timedelta(milliseconds=1),
... timedelta(minutes=1),
... timedelta(hours=1),
... timedelta(weeks=1),
... ],
... name="x",
... )
>>> psidx.days
Int64Index([1, 0, 0, 0, 0, 0, 7], dtype='int64', name='x')
>>> psidx.seconds
Int64Index([0, 1, 0, 0, 60, 3600, 0], dtype='int64', name='x')
>>> psidx.microseconds
Int64Index([0, 0, 1, 1000, 0, 0, 0], dtype='int64', name='x')
# Negative timedelta
>>> psidx = ps.TimedeltaIndex(
... [
... timedelta(days=-1),
... timedelta(seconds=-1),
... timedelta(microseconds=-1),
... timedelta(milliseconds=-1),
... timedelta(minutes=-1),
... timedelta(hours=-1),
... timedelta(weeks=-1),
... ],
... name="x",
... )
>>> psidx.days
Int64Index([-1, -1, -1, -1, -1, -1, -7], dtype='int64', name='x')
>>> psidx.seconds
Int64Index([0, 86399, 86399, 86399, 86340, 82800, 0], dtype='int64', name='x')
>>> psidx.microseconds
Int64Index([0, 0, 999999, 999000, 0, 0, 0], dtype='int64', name='x')
```
### How was this patch tested?
Unit tests.
Closes #34825 from xinrong-databricks/timedeltaProperties.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/sparktestsupport/modules.py | 2 +
.../source/reference/pyspark.pandas/indexing.rst | 23 ++++--
python/pyspark/pandas/indexes/timedelta.py | 76 ++++++++++++++++++++
python/pyspark/pandas/missing/indexes.py | 3 -
python/pyspark/pandas/spark/functions.py | 11 +++
.../pyspark/pandas/tests/indexes/test_timedelta.py | 84 ++++++++++++++++++++++
6 files changed, 189 insertions(+), 10 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 5dd3ab6..297d2ea 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -614,6 +614,7 @@ pyspark_pandas = Module(
"pyspark.pandas.indexes.base",
"pyspark.pandas.indexes.category",
"pyspark.pandas.indexes.datetimes",
+ "pyspark.pandas.indexes.timedelta",
"pyspark.pandas.indexes.multi",
"pyspark.pandas.indexes.numeric",
"pyspark.pandas.spark.accessors",
@@ -632,6 +633,7 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.data_type_ops.test_string_ops",
"pyspark.pandas.tests.data_type_ops.test_udt_ops",
"pyspark.pandas.tests.indexes.test_category",
+ "pyspark.pandas.tests.indexes.test_timedelta",
"pyspark.pandas.tests.plot.test_frame_plot",
"pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
"pyspark.pandas.tests.plot.test_frame_plot_plotly",
diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
index 0c94012..15539fa 100644
--- a/python/docs/source/reference/pyspark.pandas/indexing.rst
+++ b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -336,13 +336,6 @@ DatatimeIndex
DatetimeIndex
-TimedeltaIndex
---------------
-.. autosummary::
- :toctree: api/
-
- TimedeltaIndex
-
Time/date components
~~~~~~~~~~~~~~~~~~~~
.. autosummary::
@@ -393,3 +386,19 @@ Time-specific operations
DatetimeIndex.ceil
DatetimeIndex.month_name
DatetimeIndex.day_name
+
+TimedeltaIndex
+--------------
+.. autosummary::
+ :toctree: api/
+
+ TimedeltaIndex
+
+Components
+~~~~~~~~~~
+.. autosummary::
+ :toctree: api/
+
+ TimedeltaIndex.days
+ TimedeltaIndex.seconds
+ TimedeltaIndex.microseconds
diff --git a/python/pyspark/pandas/indexes/timedelta.py b/python/pyspark/pandas/indexes/timedelta.py
index f5e94d7..811c9d6 100644
--- a/python/pyspark/pandas/indexes/timedelta.py
+++ b/python/pyspark/pandas/indexes/timedelta.py
@@ -25,6 +25,19 @@ from pyspark._globals import _NoValue
from pyspark.pandas.indexes.base import Index
from pyspark.pandas.missing.indexes import MissingPandasLikeTimedeltaIndex
from pyspark.pandas.series import Series
+from pyspark.pandas.spark import functions as SF
+from pyspark.sql import functions as F
+
+
+HOURS_PER_DAY = 24
+MINUTES_PER_HOUR = 60
+SECONDS_PER_MINUTE = 60
+MILLIS_PER_SECOND = 1000
+MICROS_PER_MILLIS = 1000
+
+SECONDS_PER_HOUR = MINUTES_PER_HOUR * SECONDS_PER_MINUTE
+SECONDS_PER_DAY = HOURS_PER_DAY * SECONDS_PER_HOUR
+MICROS_PER_SECOND = MILLIS_PER_SECOND * MICROS_PER_MILLIS
class TimedeltaIndex(Index):
@@ -111,3 +124,66 @@ class TimedeltaIndex(Index):
return partial(property_or_func, self)
raise AttributeError("'TimedeltaIndex' object has no attribute '{}'".format(item))
+
+ @property
+ def days(self) -> Index:
+ """
+ Number of days for each element.
+ """
+
+ @no_type_check
+ def pandas_days(x) -> int:
+ return x.days
+
+ return Index(self.to_series().transform(pandas_days))
+
+ @property
+ def seconds(self) -> Index:
+ """
+ Number of seconds (>= 0 and less than 1 day) for each element.
+ """
+
+ @no_type_check
+ def get_seconds(scol):
+ hour_scol = SF.date_part("HOUR", scol)
+ minute_scol = SF.date_part("MINUTE", scol)
+ second_scol = SF.date_part("SECOND", scol)
+ return (
+ F.when(
+ hour_scol < 0,
+ SECONDS_PER_DAY + hour_scol * SECONDS_PER_HOUR,
+ ).otherwise(hour_scol * SECONDS_PER_HOUR)
+ + F.when(
+ minute_scol < 0,
+ SECONDS_PER_DAY + minute_scol * SECONDS_PER_MINUTE,
+ ).otherwise(minute_scol * SECONDS_PER_MINUTE)
+ + F.when(
+ second_scol < 0,
+ SECONDS_PER_DAY + second_scol,
+ ).otherwise(second_scol)
+ ).cast("int")
+
+ return Index(self.to_series().spark.transform(get_seconds))
+
+ @property
+ def microseconds(self) -> Index:
+ """
+ Number of microseconds (>= 0 and less than 1 second) for each element.
+ """
+
+ @no_type_check
+ def get_microseconds(scol):
+ second_scol = SF.date_part("SECOND", scol)
+ return (
+ (
+ F.when(
+ (second_scol >= 0) & (second_scol < 1),
+ second_scol,
+ )
+ .when(second_scol < 0, 1 + second_scol)
+ .otherwise(0)
+ )
+ * MICROS_PER_SECOND
+ ).cast("int")
+
+ return Index(self.to_series().spark.transform(get_microseconds))
diff --git a/python/pyspark/pandas/missing/indexes.py b/python/pyspark/pandas/missing/indexes.py
index f1ddccf..73dab4a 100644
--- a/python/pyspark/pandas/missing/indexes.py
+++ b/python/pyspark/pandas/missing/indexes.py
@@ -103,9 +103,6 @@ class MissingPandasLikeDatetimeIndex(MissingPandasLikeIndex):
class MissingPandasLikeTimedeltaIndex(MissingPandasLikeIndex):
# Properties
- days = _unsupported_property("days", cls="TimedeltaIndex")
- seconds = _unsupported_property("seconds", cls="TimedeltaIndex")
- microseconds = _unsupported_property("microseconds", cls="TimedeltaIndex")
nanoseconds = _unsupported_property("nanoseconds", cls="TimedeltaIndex")
components = _unsupported_property("components", cls="TimedeltaIndex")
inferred_freq = _unsupported_property("inferred_freq", cls="TimedeltaIndex")
diff --git a/python/pyspark/pandas/spark/functions.py b/python/pyspark/pandas/spark/functions.py
index 73251fc..dcc8fc8 100644
--- a/python/pyspark/pandas/spark/functions.py
+++ b/python/pyspark/pandas/spark/functions.py
@@ -45,6 +45,17 @@ def repeat(col: Column, n: Union[int, Column]) -> Column:
return _call_udf(sc, "repeat", _to_java_column(col), n)
+def date_part(field: Union[str, Column], source: Column) -> Column:
+ """
+ Extracts a part of the date/timestamp or interval source.
+ """
+ sc = SparkContext._active_spark_context # type: ignore[attr-defined]
+ field = (
+ _to_java_column(field) if isinstance(field, Column) else _create_column_from_literal(field)
+ )
+ return _call_udf(sc, "date_part", field, _to_java_column(source))
+
+
def lit(literal: Any) -> Column:
"""
Creates a Column of literal value.
diff --git a/python/pyspark/pandas/tests/indexes/test_timedelta.py b/python/pyspark/pandas/tests/indexes/test_timedelta.py
new file mode 100644
index 0000000..bdb98a3
--- /dev/null
+++ b/python/pyspark/pandas/tests/indexes/test_timedelta.py
@@ -0,0 +1,84 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from datetime import timedelta
+
+import pandas as pd
+
+import pyspark.pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class TimedeltaIndexTest(PandasOnSparkTestCase, TestUtils):
+ @property
+ def pidx(self):
+ return pd.TimedeltaIndex(
+ [
+ timedelta(days=1),
+ timedelta(seconds=1),
+ timedelta(microseconds=1),
+ timedelta(milliseconds=1),
+ timedelta(minutes=1),
+ timedelta(hours=1),
+ timedelta(weeks=1),
+ ],
+ name="x",
+ )
+
+ @property
+ def neg_pidx(self):
+ return pd.TimedeltaIndex(
+ [
+ timedelta(days=-1),
+ timedelta(seconds=-1),
+ timedelta(microseconds=-1),
+ timedelta(milliseconds=-1),
+ timedelta(minutes=-1),
+ timedelta(hours=-1),
+ timedelta(weeks=-1),
+ ],
+ name="x",
+ )
+
+ @property
+ def psidx(self):
+ return ps.from_pandas(self.pidx)
+
+ @property
+ def neg_psidx(self):
+ return ps.from_pandas(self.neg_pidx)
+
+ def test_properties(self):
+ self.assert_eq(self.psidx.days, self.pidx.days)
+ self.assert_eq(self.psidx.seconds, self.pidx.seconds)
+ self.assert_eq(self.psidx.microseconds, self.pidx.microseconds)
+ self.assert_eq(self.neg_psidx.days, self.neg_pidx.days)
+ self.assert_eq(self.neg_psidx.seconds, self.neg_pidx.seconds)
+ self.assert_eq(self.neg_psidx.microseconds, self.neg_pidx.microseconds)
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.indexes.test_timedelta import * # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]