This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f3e162336f65 [SPARK-46543][PYTHON][CONNECT] Make `json_tuple` throw PySparkValueError for empty fields
f3e162336f65 is described below
commit f3e162336f65286caebff053f54c9380e35a21ae
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Jan 2 11:43:31 2024 +0800
[SPARK-46543][PYTHON][CONNECT] Make `json_tuple` throw PySparkValueError
for empty fields
### What changes were proposed in this pull request?
Make `json_tuple` throw PySparkValueError for empty fields
### Why are the changes needed?
Python side should have the same check as the Scala side:
https://github.com/apache/spark/blob/fa4096eb6aba4c66f0d9c5dcbabdfc0804064fff/sql/core/src/main/scala/org/apache/spark/sql/functions.scala#L6330-L6334
### Does this PR introduce _any_ user-facing change?
yes
### How was this patch tested?
Added a unit test (`test_json_tuple_empty_fields`).
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44534 from zhengruifeng/py_check_functions.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/connect/functions/builtin.py | 5 +++++
python/pyspark/sql/functions/builtin.py | 5 +++++
python/pyspark/sql/tests/test_functions.py | 14 ++++++++++++++
3 files changed, 24 insertions(+)
diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py
index baf8dc82fd84..461694362612 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -1928,6 +1928,11 @@ inline_outer.__doc__ = pysparkfuncs.inline_outer.__doc__
def json_tuple(col: "ColumnOrName", *fields: str) -> Column:
+ if len(fields) == 0:
+ raise PySparkValueError(
+ error_class="CANNOT_BE_EMPTY",
+ message_parameters={"item": "field"},
+ )
    return _invoke_function("json_tuple", _to_col(col), *[lit(field) for field in fields])
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 7f5e90739507..4147e71f4bb4 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -14246,6 +14246,11 @@ def json_tuple(col: "ColumnOrName", *fields: str) -> Column:
    >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect()
    [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]
    """
+ if len(fields) == 0:
+ raise PySparkValueError(
+ error_class="CANNOT_BE_EMPTY",
+ message_parameters={"item": "field"},
+ )
    sc = _get_active_spark_context()
    return _invoke_function("json_tuple", _to_java_column(col), _to_seq(sc, fields))
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index df1ddd0301ad..aaf58136508a 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -1452,6 +1452,20 @@ class FunctionsTestsMixin:
self.assertIsInstance(df.first()[0], datetime.datetime)
self.assertEqual(df.schema.names[0], "now()")
+ def test_json_tuple_empty_fields(self):
+ df = self.spark.createDataFrame(
+ [
+ ("1", """{"f1": "value1", "f2": "value2"}"""),
+ ("2", """{"f1": "value12"}"""),
+ ],
+ ("key", "jstring"),
+ )
+ self.assertRaisesRegex(
+ PySparkValueError,
+ "At least one field must be specified",
+ lambda: df.select(F.json_tuple(df.jstring)),
+ )
+
class FunctionsTests(ReusedSQLTestCase, FunctionsTestsMixin):
pass
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]