This is an automated email from the ASF dual-hosted git repository.

zero323 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 899dec2  [SPARK-37738][PYTHON] Support column type inputs for second arg of date manipulation functions
899dec2 is described below

commit 899dec2ea36bcd934f1512b26e7fc8903e9f5ba8
Author:     Daniel Davies <ddav...@palantir.com>
AuthorDate: Thu Dec 30 16:07:39 2021 +0100

    [SPARK-37738][PYTHON] Support column type inputs for second arg of date manipulation functions

    ### What changes were proposed in this pull request?

    See https://issues.apache.org/jira/browse/SPARK-37738. There is a skew between the Scala and PySpark APIs: date_add, date_sub, and add_months accept either a Column or an integer for their second parameter in Scala, but only an 'int' in PySpark. This PR makes both types available for that parameter in PySpark.

    ### Why are the changes needed?

    Users should see a consistent API across Python- and Scala-side processing.

    ### Does this PR introduce _any_ user-facing change?

    Yes, additive only:

    >>> df = spark.createDataFrame([('2015-04-08', 2,)], ['dt', 'add'])
    >>> df.select(date_add(df.dt, df.add).alias('next_date')).collect()
    [Row(next_date=datetime.date(2015, 4, 10))]

    ### How was this patch tested?

    Three new unit tests.

    Closes #35032 from Daniel-Davies/master.

    Lead-authored-by: Daniel Davies <ddav...@palantir.com>
    Co-authored-by: Daniel-Davies <33356828+daniel-dav...@users.noreply.github.com>
    Signed-off-by: zero323 <mszymkiew...@gmail.com>
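For reference, a minimal end-to-end sketch of the widened API, assuming a running SparkSession bound to `spark`; the DataFrame and column names below are illustrative only, not part of the change:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import add_months, date_add, date_sub, lit

    spark = SparkSession.builder.getOrCreate()

    # Declare IntegerType explicitly; an inferred Python int would become
    # LongType, which these functions do not accept.
    df = spark.createDataFrame([("2015-04-08", 2)], "dt string, n int")

    df.select(
        date_add(df.dt, 1),         # plain int, as before
        date_add(df.dt, "n"),       # column name, resolved to a Column
        date_sub(df.dt, df.n),      # Column expression
        add_months(df.dt, lit(2)),  # explicit literal Column
    ).show()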
---
 python/pyspark/sql/functions.py            | 33 ++++++++++++-----
 python/pyspark/sql/tests/test_functions.py | 57 ++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index a0f2bbf..4791d3c 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -2200,7 +2200,7 @@ def make_date(year: "ColumnOrName", month: "ColumnOrName", day: "ColumnOrName")
     return Column(jc)
 
 
-def date_add(start: "ColumnOrName", days: int) -> Column:
+def date_add(start: "ColumnOrName", days: Union["ColumnOrName", int]) -> Column:
     """
     Returns the date that is `days` days after `start`
 
@@ -2208,16 +2208,21 @@ def date_add(start: "ColumnOrName", days: int) -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df = spark.createDataFrame([('2015-04-08', 2,)], ['dt', 'add'])
     >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
     [Row(next_date=datetime.date(2015, 4, 9))]
+    >>> df.select(date_add(df.dt, df.add.cast('integer')).alias('next_date')).collect()
+    [Row(next_date=datetime.date(2015, 4, 10))]
     """
     sc = SparkContext._active_spark_context
     assert sc is not None and sc._jvm is not None
-    return Column(sc._jvm.functions.date_add(_to_java_column(start), days))
+
+    days = lit(days) if isinstance(days, int) else days
+
+    return Column(sc._jvm.functions.date_add(_to_java_column(start), _to_java_column(days)))
 
 
-def date_sub(start: "ColumnOrName", days: int) -> Column:
+def date_sub(start: "ColumnOrName", days: Union["ColumnOrName", int]) -> Column:
     """
     Returns the date that is `days` days before `start`
 
@@ -2225,13 +2230,18 @@ def date_sub(start: "ColumnOrName", days: int) -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df = spark.createDataFrame([('2015-04-08', 2,)], ['dt', 'sub'])
     >>> df.select(date_sub(df.dt, 1).alias('prev_date')).collect()
     [Row(prev_date=datetime.date(2015, 4, 7))]
+    >>> df.select(date_sub(df.dt, df.sub.cast('integer')).alias('prev_date')).collect()
+    [Row(prev_date=datetime.date(2015, 4, 6))]
     """
     sc = SparkContext._active_spark_context
     assert sc is not None and sc._jvm is not None
-    return Column(sc._jvm.functions.date_sub(_to_java_column(start), days))
+
+    days = lit(days) if isinstance(days, int) else days
+
+    return Column(sc._jvm.functions.date_sub(_to_java_column(start), _to_java_column(days)))
 
 
 def datediff(end: "ColumnOrName", start: "ColumnOrName") -> Column:
@@ -2251,7 +2261,7 @@ def datediff(end: "ColumnOrName", start: "ColumnOrName") -> Column:
     return Column(sc._jvm.functions.datediff(_to_java_column(end), _to_java_column(start)))
 
 
-def add_months(start: "ColumnOrName", months: int) -> Column:
+def add_months(start: "ColumnOrName", months: Union["ColumnOrName", int]) -> Column:
     """
     Returns the date that is `months` months after `start`
 
@@ -2259,13 +2269,18 @@ def add_months(start: "ColumnOrName", months: int) -> Column:
 
     Examples
    --------
-    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+    >>> df = spark.createDataFrame([('2015-04-08', 2)], ['dt', 'add'])
     >>> df.select(add_months(df.dt, 1).alias('next_month')).collect()
     [Row(next_month=datetime.date(2015, 5, 8))]
+    >>> df.select(add_months(df.dt, df.add.cast('integer')).alias('next_month')).collect()
+    [Row(next_month=datetime.date(2015, 6, 8))]
     """
     sc = SparkContext._active_spark_context
     assert sc is not None and sc._jvm is not None
-    return Column(sc._jvm.functions.add_months(_to_java_column(start), months))
+
+    months = lit(months) if isinstance(months, int) else months
+
+    return Column(sc._jvm.functions.add_months(_to_java_column(start), _to_java_column(months)))
 
 
 def months_between(date1: "ColumnOrName", date2: "ColumnOrName", roundOff: bool = True) -> Column:
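One caveat carried into the new doctests above: without an explicit schema, PySpark infers Python ints as LongType, which these functions do not accept, hence the `.cast('integer')` calls. A minimal sketch of the pitfall, again assuming a running SparkSession:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import date_add

    spark = SparkSession.builder.getOrCreate()

    # With schema inference, the Python int 2 becomes a LongType column 'add'.
    df = spark.createDataFrame([("2015-04-08", 2)], ["dt", "add"])

    # A LongType 'days' argument is rejected by the function; cast it down to
    # IntegerType first, as the doctests above do.
    df.select(date_add(df.dt, df.add.cast("integer")).alias("next_date")).collect()
    # [Row(next_date=datetime.date(2015, 4, 10))]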
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index b6a3278..eb3b433 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -43,6 +43,9 @@ from pyspark.sql.functions import (
     csc,
     cot,
     make_date,
+    date_add,
+    date_sub,
+    add_months,
 )
 from pyspark.testing.sqlutils import ReusedSQLTestCase, SQLTestUtils
 
@@ -286,6 +289,60 @@ class FunctionsTests(ReusedSQLTestCase):
         row = df.select(dayofweek(df.date)).first()
         self.assertEqual(row[0], 2)
 
+    # Test added for SPARK-37738; change Python API to accept both col & int as input
+    def test_date_add_function(self):
+        dt = datetime.date(2021, 12, 27)
+
+        # Note; number var in Python gets converted to LongType column;
+        # this is not supported by the function, so cast to Integer explicitly
+        df = self.spark.createDataFrame([Row(date=dt, add=2)], "date date, add integer")
+
+        self.assertTrue(
+            all(
+                df.select(
+                    date_add(df.date, df.add) == datetime.date(2021, 12, 29),
+                    date_add(df.date, "add") == datetime.date(2021, 12, 29),
+                    date_add(df.date, 3) == datetime.date(2021, 12, 30),
+                ).first()
+            )
+        )
+
+    # Test added for SPARK-37738; change Python API to accept both col & int as input
+    def test_date_sub_function(self):
+        dt = datetime.date(2021, 12, 27)
+
+        # Note; number var in Python gets converted to LongType column;
+        # this is not supported by the function, so cast to Integer explicitly
+        df = self.spark.createDataFrame([Row(date=dt, sub=2)], "date date, sub integer")
+
+        self.assertTrue(
+            all(
+                df.select(
+                    date_sub(df.date, df.sub) == datetime.date(2021, 12, 25),
+                    date_sub(df.date, "sub") == datetime.date(2021, 12, 25),
+                    date_sub(df.date, 3) == datetime.date(2021, 12, 24),
+                ).first()
+            )
+        )
+
+    # Test added for SPARK-37738; change Python API to accept both col & int as input
+    def test_add_months_function(self):
+        dt = datetime.date(2021, 12, 27)
+
+        # Note; number in Python gets converted to LongType column;
+        # this is not supported by the function, so cast to Integer explicitly
+        df = self.spark.createDataFrame([Row(date=dt, add=2)], "date date, add integer")
+
+        self.assertTrue(
+            all(
+                df.select(
+                    add_months(df.date, df.add) == datetime.date(2022, 2, 27),
+                    add_months(df.date, "add") == datetime.date(2022, 2, 27),
+                    add_months(df.date, 3) == datetime.date(2022, 3, 27),
+                ).first()
+            )
+        )
+
     def test_make_date(self):
         # SPARK-36554: expose make_date expression
         df = self.spark.createDataFrame([(2020, 6, 26)], ["Y", "M", "D"])

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org