This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f41bacb07d04 [SPARK-46326][PYTHON][TESTS] Test missing cases for functions (pyspark.sql.functions)
f41bacb07d04 is described below
commit f41bacb07d04b21d66c0826420a67da41536e445
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Fri Dec 8 20:51:25 2023 +0900
[SPARK-46326][PYTHON][TESTS] Test missing cases for functions (pyspark.sql.functions)
### What changes were proposed in this pull request?
This PR proposes to improve test coverage by adding tests for the untested cases in
`pyspark.sql.functions`. In addition, it slightly improves the examples by adding a
few doctests.
### Why are the changes needed?
For better test coverage, and to avoid regressions. These cases are not currently being tested:
https://app.codecov.io/gh/apache/spark/blob/master/python%2Fpyspark%2Fsql%2Fsession.py
### Does this PR introduce _any_ user-facing change?
It contains minor docstring improvements. Otherwise, it is test-only.
### How was this patch tested?
Manually ran the unit tests via:
```bash
./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.tests.connect.test_parity_functions'
./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.tests.test_functions'
./python/run-tests --python-executables=python3 --testnames 'pyspark.sql.functions.builtin'
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #44256 from HyukjinKwon/dataframe-test.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 78 ++++++++++++++++++++++++++----
python/pyspark/sql/tests/test_functions.py | 33 ++++++++++++-
2 files changed, 101 insertions(+), 10 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 4f8e6a8e1d14..997b641080cf 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -8279,9 +8279,40 @@ def unix_timestamp(
Examples
--------
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
+
+ Example 1: Returns the current timestamp in UNIX.
+
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(1).select(sf.unix_timestamp().alias('unix_time')).show()
+ ... # doctest: +SKIP
+ +----------+
+ | unix_time|
+ +----------+
+ |1702018137|
+ +----------+
+
+ Example 2: Using default format 'yyyy-MM-dd HH:mm:ss' parses the timestamp string.
+
+ >>> import pyspark.sql.functions as sf
+ >>> time_df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['dt'])
+ >>> time_df.select(sf.unix_timestamp('dt').alias('unix_time')).show()
+ +----------+
+ | unix_time|
+ +----------+
+ |1428520332|
+ +----------+
+
+ Example 3: Using user-specified format 'yyyy-MM-dd' parses the timestamp string.
+
+ >>> import pyspark.sql.functions as sf
>>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
- >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect()
- [Row(unix_time=1428476400)]
+ >>> time_df.select(sf.unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).show()
+ +----------+
+ | unix_time|
+ +----------+
+ |1428476400|
+ +----------+
+
>>> spark.conf.unset("spark.sql.session.timeZone")
"""
if timestamp is None:
@@ -8569,13 +8600,21 @@ def window(
Examples
--------
>>> import datetime
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame(
... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],
... ).toDF("date", "val")
- >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))
- >>> w.select(w.window.start.cast("string").alias("start"),
- ... w.window.end.cast("string").alias("end"), "sum").collect()
- [Row(start='2016-03-11 09:00:05', end='2016-03-11 09:00:10', sum=1)]
+ >>> w = df.groupBy(sf.window("date", "5 seconds")).agg(sf.sum("val").alias("sum"))
+ >>> w.select(
+ ... w.window.start.cast("string").alias("start"),
+ ... w.window.end.cast("string").alias("end"),
+ ... "sum"
+ ... ).show()
+ +-------------------+-------------------+---+
+ | start| end|sum|
+ +-------------------+-------------------+---+
+ |2016-03-11 09:00:05|2016-03-11 09:00:10| 1|
+ +-------------------+-------------------+---+
"""
def check_string_field(field, fieldName): # type: ignore[no-untyped-def]
@@ -8737,9 +8776,30 @@ def to_unix_timestamp(
Examples
--------
>>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
- >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])
- >>> df.select(to_unix_timestamp(df.e, lit("yyyy-MM-dd")).alias('r')).collect()
- [Row(r=1460098800)]
+
+ Example 1: Using default format 'yyyy-MM-dd HH:mm:ss' parses the timestamp string.
+
+ >>> import pyspark.sql.functions as sf
+ >>> time_df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['dt'])
+ >>> time_df.select(sf.to_unix_timestamp('dt').alias('unix_time')).show()
+ +----------+
+ | unix_time|
+ +----------+
+ |1428520332|
+ +----------+
+
+ Example 2: Using user-specified format 'yyyy-MM-dd' parses the timestamp string.
+
+ >>> import pyspark.sql.functions as sf
+ >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])
+ >>> time_df.select(
+ ... sf.to_unix_timestamp('dt', sf.lit('yyyy-MM-dd')).alias('unix_time')).show()
+ +----------+
+ | unix_time|
+ +----------+
+ |1428476400|
+ +----------+
+
>>> spark.conf.unset("spark.sql.session.timeZone")
"""
if format is not None:
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 8586fac4e86d..b59417d8a310 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -1002,7 +1002,7 @@ class FunctionsTestsMixin:
[(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ["date", "val"]
)
- w = df.groupBy(F.window("date", "5 seconds")).agg(F.sum("val").alias("sum"))
+ w = df.groupBy(F.window("date", "5 seconds", "5 seconds")).agg(F.sum("val").alias("sum"))
r = w.select(
w.window.end.cast("string").alias("end"),
F.window_time(w.window).cast("string").alias("window_time"),
@@ -1365,6 +1365,37 @@ class FunctionsTestsMixin:
message_parameters={"arg_name": "numBuckets", "arg_type": "str"},
)
+ def test_to_timestamp_ltz(self):
+ df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+ df = df.select(F.to_timestamp_ltz(df.e, F.lit("yyyy-MM-dd")).alias("r"))
+ self.assertIsInstance(df.first()[0], datetime.datetime)
+
+ df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+ df = df.select(F.to_timestamp_ltz(df.e).alias("r"))
+ self.assertIsInstance(df.first()[0], datetime.datetime)
+
+ def test_to_timestamp_ntz(self):
+ df = self.spark.createDataFrame([("2016-12-31",)], ["e"])
+ df = df.select(F.to_timestamp_ntz(df.e).alias("r"))
+ self.assertIsInstance(df.first()[0], datetime.datetime)
+
+ def test_convert_timezone(self):
+ df = self.spark.createDataFrame([("2015-04-08",)], ["dt"])
+ df = df.select(
+ F.convert_timezone(F.lit("America/Los_Angeles"), F.lit("Asia/Hong_Kong"), "dt")
+ )
+ self.assertIsInstance(df.first()[0], datetime.datetime)
+
+ def test_map_concat(self):
+ df = self.spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2")
+ self.assertEqual(
+ df.select(F.map_concat(["map1", "map2"]).alias("map3")).first()[0],
+ {1: "a", 2: "b", 3: "c"},
+ )
+
+ def test_version(self):
+ self.assertIsInstance(self.spark.range(1).select(F.version()).first()[0], str)
+
# SPARK-45216: Fix non-deterministic seeded Dataset APIs
def test_non_deterministic_with_seed(self):
df = self.spark.createDataFrame([([*range(0, 10, 1)],)], ["a"])
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]