This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f41bacb07d04 [SPARK-46326][PYTHON][TESTS] Test missing cases for functions (pyspark.sql.functions)
f41bacb07d04 is described below
commit f41bacb07d04b21d66c0826420a67da41536e445
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Fri Dec 8 20:51:25 2023 +0900
[SPARK-46326][PYTHON][TESTS] Test missing cases for functions (pyspark.sql.functions)
### What changes were proposed in this pull request?
This PR proposes to improve test coverage by adding tests for the untested cases in
`pyspark.sql.functions`. In addition, it slightly improves the examples by adding a
few doctests.
### Why are the changes needed?
For better test coverage, and to avoid regressions. These cases are not currently being tested:
https://app.codecov.io/gh/apache/spark/blob/master/python%2Fpyspark%2Fsql%2Fsession.py
### Does this PR introduce _any_ user-facing change?
It contains minor docstring improvements. Otherwise, it is test-only.
### How was this patch tested?
Manually ran the unit tests via:
```bash
./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.tests.connect.test_parity_functions'
./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.tests.test_functions'
./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.functions.builtin'
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #44256 from HyukjinKwon/dataframe-test.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 78 ++++++++++++++++++++++++++----
python/pyspark/sql/tests/test_functions.py | 33 ++++++++++++-
2 files changed, 101 insertions(+), 10 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 4f8e6a8e1d14..997b641080cf 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -8279,9 +8279,40 @@ def unix_timestamp(
Examples
--------
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
+
+ Example 1: Returns the current timestamp in UNIX.
+
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(1).select(sf.unix_timestamp().alias('unix_time')).show()
+ ... # doctest: +SKIP
+ +----------+
+ | unix_time|
+ +----------+
+ |1702018137|
+ +----------+
+
+ Example 2: Using default format 'yyyy-MM-dd HH:mm:ss' parses the timestamp string.
+
+ >>> import pyspark.sql.functions as sf
+ >>> time_df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['dt'])
+ >>> time_df.select(sf.unix_timestamp('dt').alias('unix_time')).show()
+ +----------+
+ | unix_time|
+ +----------+
+ |1428520332|
+ +----------+
+
+ Example 3: Using user-specified format 'yyyy-MM-dd' parses the timestamp string.
+
+ >>> import pyspark.sql.functions as sf
>>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
- >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect()
- [Row(unix_time=1428476400)]
+ >>> time_df.select(sf.unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).show()
+ +----------+
+ | unix_time|
+ +----------+
+ |1428476400|
+ +----------+
+
>>> spark.conf.unset("spark.sql.session.timeZone")
"""
if timestamp is None:
@@ -8569,13 +8600,21 @@ def window(
Examples
--------
>>> import datetime
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame(
... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],
... ).toDF("date", "val")
- >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))
- >>> w.select(w.window.start.cast("string").alias("start"),
- ... w.window.end.cast("string").alias("end"), "sum").collect()
- [Row(start='2016-03-11 09:00:05', end='2016-03-11 09:00:10', sum=1)]
+ >>> w = df.groupBy(sf.window("date", "5 seconds")).agg(sf.sum("val").alias("sum"))
+ >>> w.select(
+ ... w.window.start.cast("string").alias("start"),
+ ... w.window.end.cast("string").alias("end"),
+ ... "sum"
+ ... ).show()
+ +-------------------+-------------------+---+
+ | start| end|sum|
+ +-------------------+-------------------+---+
+ |2016-03-11 09:00:05|2016-03-11 09:00:10| 1|
+ +-------------------+-------------------+---+
"""
def check_string_field(field, fieldName): # type: ignore[no-untyped-def]
@@ -8737,9 +8776,30 @@ def to_unix_timestamp(
Examples
--------
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
- >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
- >>> df.select(to_unix_timestamp(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
- [Row(r=1460098800)]
+
+ Example 1: Using default format 'yyyy-MM-dd HH:mm:ss' parses the timestamp string.
+
+ >>> import pyspark.sql.functions as sf
+ >>> time_df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['dt'])
+ >>> time_df.select(sf.to_unix_timestamp('dt').alias('unix_time')).show()
+ +----------+
+ | unix_time|
+ +----------+
+ |1428520332|
+ +----------+
+
+ Example 2: Using user-specified format 'yyyy-MM-dd' parses the timestamp string.
+
+ >>> import pyspark.sql.functions as sf
+ >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+ >>> time_df.select(
+ ... sf.to_unix_timestamp('dt', sf.lit('yyyy-MM-dd')).alias('unix_time')).show()
+ +----------+
+ | unix_time|
+ +----------+
+ |1428476400|
+ +----------+
+
>>> spark.conf.unset("spark.sql.session.timeZone")
"""
if format is not None:
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 8586fac4e86d..b59417d8a310 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -1002,7 +1002,7 @@ class FunctionsTestsMixin:
[(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ["date", "val"]
)
- w = df.groupBy(F.window("date", "5 seconds")).agg(F.sum("val").alias("sum"))
+ w = df.groupBy(F.window("date", "5 seconds", "5 seconds")).agg(F.sum("val").alias("sum"))
r = w.select(
w.window.end.cast("string").alias("end"),
F.window_time(w.window).cast("string").alias("window_time"),
@@ -1365,6 +1365,37 @@ class FunctionsTestsMixin:
message_parameters={"arg_name": "numBuckets", "arg_type": "str"},
)
+ def test_to_timestamp_ltz(self):
+ df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+ df = df.select(F.to_timestamp_ltz(df.e, F.lit("yyyy-MM-dd")).alias("r"))
+ self.assertIsInstance(df.first()[0], datetime.datetime)
+
+ df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+ df = df.select(F.to_timestamp_ltz(df.e).alias("r"))
+ self.assertIsInstance(df.first()[0], datetime.datetime)
+
+ def test_to_timestamp_ntz(self):
+ df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+ df = df.select(F.to_timestamp_ntz(df.e).alias("r"))
+ self.assertIsInstance(df.first()[0], datetime.datetime)
+
+ def test_convert_timezone(self):
+ df = self.spark.createDataFrame([("2015-04-08",)], ["dt"])
+ df = df.select(
+ F.convert_timezone(F.lit("America/Los_Angeles"), F.lit("Asia/Hong_Kong"), "dt")
+ )
+ self.assertIsInstance(df.first()[0], datetime.datetime)
+
+ def test_map_concat(self):
+ df = self.spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2")
+ self.assertEqual(
+ df.select(F.map_concat(["map1", "map2"]).alias("map3")).first()[0],
+ {1: "a", 2: "b", 3: "c"},
+ )
+
+ def test_version(self):
+ self.assertIsInstance(self.spark.range(1).select(F.version()).first()[0], str)
+
# SPARK-45216: Fix non-deterministic seeded Dataset APIs
def test_non_deterministic_with_seed(self):
df = self.spark.createDataFrame([([*range(0, 10, 1)],)], ["a"])
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]