This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 41a2bebe725d [SPARK-48865][SQL] Add try_url_decode function
41a2bebe725d is described below
commit 41a2bebe725d7580ef848602223c76a824eb3c1f
Author: wforget <[email protected]>
AuthorDate: Wed Jul 17 21:50:16 2024 +0800
[SPARK-48865][SQL] Add try_url_decode function
### What changes were proposed in this pull request?
Add a `try_url_decode` function that performs the same operation as
`url_decode`, but returns a NULL value instead of raising an error if the
decoding cannot be performed.
### Why are the changes needed?
In Hive, URL decoding is usually done via `reflect('java.net.URLDecoder',
'decode', 'test%1')`, which returns a `NULL` value instead of raising an error
if the decoding cannot be performed.
Although Spark provides a `try_reflect` function for this, as commented in
https://github.com/apache/spark/pull/34023#issuecomment-2113995703, the
`reflect` function may prevent partition pruning from taking effect. So I
propose adding a new `try_url_decode` function.
### Does this PR introduce _any_ user-facing change?
Yes, this adds a new function, `try_url_decode`.
### How was this patch tested?
added tests and did manual testing
spark-sql:

pyspark:

### Was this patch authored or co-authored using generative AI tooling?
No
Closes #47294 from wForget/try_url_decode.
Lead-authored-by: wforget <[email protected]>
Co-authored-by: Kent Yao <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
---
.../function_try_url_decode.explain | 2 +
.../queries/function_try_url_decode.json | 25 ++++++++++++
.../queries/function_try_url_decode.proto.bin | Bin 0 -> 183 bytes
.../scala/org/apache/spark/sql/functions.scala | 9 +++++
.../apache/spark/sql/PlanGenerationTestSuite.scala | 4 ++
.../source/reference/pyspark.sql/functions.rst | 1 +
python/pyspark/sql/connect/functions/builtin.py | 7 ++++
python/pyspark/sql/functions/builtin.py | 45 +++++++++++++++++++++
.../sql/catalyst/analysis/FunctionRegistry.scala | 1 +
.../sql/catalyst/expressions/urlExpressions.scala | 31 ++++++++++++++
.../scala/org/apache/spark/sql/functions.scala | 9 +++++
.../sql-functions/sql-expression-schema.md | 1 +
.../analyzer-results/url-functions.sql.out | 28 +++++++++++++
.../resources/sql-tests/inputs/url-functions.sql | 8 +++-
.../sql-tests/results/url-functions.sql.out | 32 +++++++++++++++
15 files changed, 202 insertions(+), 1 deletion(-)
diff --git
a/connect/common/src/test/resources/query-tests/explain-results/function_try_url_decode.explain
b/connect/common/src/test/resources/query-tests/explain-results/function_try_url_decode.explain
new file mode 100644
index 000000000000..74b360a6b5f3
--- /dev/null
+++
b/connect/common/src/test/resources/query-tests/explain-results/function_try_url_decode.explain
@@ -0,0 +1,2 @@
+Project [tryeval(static_invoke(UrlCodec.decode(g#0, UTF-8))) AS
try_url_decode(g)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git
a/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json
b/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json
new file mode 100644
index 000000000000..d51704c8f62e
--- /dev/null
+++
b/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json
@@ -0,0 +1,25 @@
+{
+ "common": {
+ "planId": "1"
+ },
+ "project": {
+ "input": {
+ "common": {
+ "planId": "0"
+ },
+ "localRelation": {
+ "schema":
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+ }
+ },
+ "expressions": [{
+ "unresolvedFunction": {
+ "functionName": "try_url_decode",
+ "arguments": [{
+ "unresolvedAttribute": {
+ "unparsedIdentifier": "g"
+ }
+ }]
+ }
+ }]
+ }
+}
\ No newline at end of file
diff --git
a/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin
b/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin
new file mode 100644
index 000000000000..3e84921b1220
Binary files /dev/null and
b/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin
differ
diff --git
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 81f25b3d743f..02b25dd6cbb5 100644
---
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4588,6 +4588,15 @@ object functions {
*/
def url_decode(str: Column): Column = Column.fn("url_decode", str)
+ /**
+ * This is a special version of `url_decode` that performs the same
operation, but returns a
+ * NULL value instead of raising an error if the decoding cannot be
performed.
+ *
+ * @group url_funcs
+ * @since 4.0.0
+ */
+ def try_url_decode(str: Column): Column = Column.fn("try_url_decode", str)
+
/**
* Translates a string into 'application/x-www-form-urlencoded' format using
a specific encoding
* scheme.
diff --git
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index 18bae516489f..07b03c4564bc 100644
---
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -2677,6 +2677,10 @@ class PlanGenerationTestSuite
fn.url_decode(fn.col("g"))
}
+ functionTest("try_url_decode") {
+ fn.try_url_decode(fn.col("g"))
+ }
+
functionTest("url_encode") {
fn.url_encode(fn.col("g"))
}
diff --git a/python/docs/source/reference/pyspark.sql/functions.rst
b/python/docs/source/reference/pyspark.sql/functions.rst
index e0895959e893..c7ae525429ca 100644
--- a/python/docs/source/reference/pyspark.sql/functions.rst
+++ b/python/docs/source/reference/pyspark.sql/functions.rst
@@ -580,6 +580,7 @@ URL Functions
parse_url
url_decode
url_encode
+ try_url_decode
Misc Functions
diff --git a/python/pyspark/sql/connect/functions/builtin.py
b/python/pyspark/sql/connect/functions/builtin.py
index 1f21e003bf01..dd81c11af292 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -2739,6 +2739,13 @@ def url_decode(str: "ColumnOrName") -> Column:
url_decode.__doc__ = pysparkfuncs.url_decode.__doc__
+def try_url_decode(str: "ColumnOrName") -> Column:
+ return _invoke_function_over_columns("try_url_decode", str)
+
+
+try_url_decode.__doc__ = pysparkfuncs.try_url_decode.__doc__
+
+
def url_encode(str: "ColumnOrName") -> Column:
return _invoke_function_over_columns("url_encode", str)
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 3193c3c4b574..d652e5923bce 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -12425,6 +12425,51 @@ def url_decode(str: "ColumnOrName") -> Column:
return _invoke_function_over_columns("url_decode", str)
+@_try_remote_functions
+def try_url_decode(str: "ColumnOrName") -> Column:
+ """
+ This is a special version of `url_decode` that performs the same
operation, but returns a
+ NULL value instead of raising an error if the decoding cannot be performed.
+
+ .. versionadded:: 4.0.0
+
+ Parameters
+ ----------
+ str : :class:`~pyspark.sql.Column` or str
+ A column of strings, each representing a URL-encoded string.
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ A new column of strings, each representing the decoded string.
+
+ Examples
+ --------
+ Example 1: Decoding a URL-encoded string
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)],
["url"])
+ >>> df.select(sf.try_url_decode(df.url)).show(truncate=False)
+ +------------------------+
+ |try_url_decode(url) |
+ +------------------------+
+ |https://spark.apache.org|
+ +------------------------+
+
+ Example 2: Return NULL if the decoding cannot be performed.
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([("https%3A%2F%2spark.apache.org",)],
["url"])
+ >>> df.select(sf.try_url_decode(df.url)).show()
+ +-------------------+
+ |try_url_decode(url)|
+ +-------------------+
+ | NULL|
+ +-------------------+
+ """
+ return _invoke_function_over_columns("try_url_decode", str)
+
+
@_try_remote_functions
def url_encode(str: "ColumnOrName") -> Column:
"""
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 8a5a32c173bb..d47a34c3626c 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -461,6 +461,7 @@ object FunctionRegistry {
expressionBuilder("try_to_timestamp", TryToTimestampExpressionBuilder,
setAlias = true),
expression[TryAesDecrypt]("try_aes_decrypt"),
expression[TryReflect]("try_reflect"),
+ expression[TryUrlDecode]("try_url_decode"),
// aggregate functions
expression[HyperLogLogPlusPlus]("approx_count_distinct"),
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
index 1687b69e46be..c2b999f30161 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
@@ -106,6 +106,37 @@ case class UrlDecode(child: Expression)
override def prettyName: String = "url_decode"
}
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """
+ _FUNC_(str) - This is a special version of `url_decode` that performs the
same operation, but returns a NULL value instead of raising an error if the
decoding cannot be performed.
+ """,
+ arguments = """
+ Arguments:
+ * str - a string expression to decode
+ """,
+ examples = """
+ Examples:
+ > SELECT _FUNC_('https%3A%2F%2Fspark.apache.org');
+ https://spark.apache.org
+ """,
+ since = "4.0.0",
+ group = "url_funcs")
+// scalastyle:on line.size.limit
+case class TryUrlDecode(expr: Expression, replacement: Expression)
+ extends RuntimeReplaceable with InheritAnalysisRules {
+
+ def this(expr: Expression) = this(expr, TryEval(UrlDecode(expr)))
+
+ override protected def withNewChildInternal(newChild: Expression):
Expression = {
+ copy(replacement = newChild)
+ }
+
+ override def parameters: Seq[Expression] = Seq(expr)
+
+ override def prettyName: String = "try_url_decode"
+}
+
object UrlCodec {
def encode(src: UTF8String, enc: UTF8String): UTF8String = {
UTF8String.fromString(URLEncoder.encode(src.toString, enc.toString))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5b4d27fc65d0..dd9b8cd26ad2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4548,6 +4548,15 @@ object functions {
*/
def url_decode(str: Column): Column = Column.fn("url_decode", str)
+ /**
+ * This is a special version of `url_decode` that performs the same
operation, but returns
+ * a NULL value instead of raising an error if the decoding cannot be
performed.
+ *
+ * @group url_funcs
+ * @since 4.0.0
+ */
+ def try_url_decode(str: Column): Column = Column.fn("try_url_decode", str)
+
/**
* Translates a string into 'application/x-www-form-urlencoded' format
* using a specific encoding scheme.
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index bf4622accf41..228c8f5df692 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -359,6 +359,7 @@
| org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary |
SELECT try_to_binary('abc', 'utf-8') | struct<try_to_binary(abc, utf-8):binary>
|
| org.apache.spark.sql.catalyst.expressions.TryToNumber | try_to_number |
SELECT try_to_number('454', '999') | struct<try_to_number(454,
999):decimal(3,0)> |
| org.apache.spark.sql.catalyst.expressions.TryToTimestampExpressionBuilder |
try_to_timestamp | SELECT try_to_timestamp('2016-12-31 00:12:00') |
struct<try_to_timestamp(2016-12-31 00:12:00):timestamp> |
+| org.apache.spark.sql.catalyst.expressions.TryUrlDecode | try_url_decode |
SELECT try_url_decode('https%3A%2F%2Fspark.apache.org') |
struct<try_url_decode(https%3A%2F%2Fspark.apache.org):string> |
| org.apache.spark.sql.catalyst.expressions.TryValidateUTF8 |
try_validate_utf8 | SELECT try_validate_utf8('Spark') |
struct<try_validate_utf8(Spark):string> |
| org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1)
| struct<typeof(1):string> |
| org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT
unbase64('U3BhcmsgU1FM') | struct<unbase64(U3BhcmsgU1FM):binary> |
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/url-functions.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/url-functions.sql.out
index d93bbdb5400e..ca5984f93280 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/url-functions.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/url-functions.sql.out
@@ -102,3 +102,31 @@ select url_decode(null)
-- !query analysis
Project [url_decode(cast(null as string)) AS url_decode(NULL)#x]
+- OneRowRelation
+
+
+-- !query
+select try_url_decode('https%3A%2F%2Fspark.apache.org')
+-- !query analysis
+Project [try_url_decode(https%3A%2F%2Fspark.apache.org) AS
try_url_decode(https%3A%2F%2Fspark.apache.org)#x]
++- OneRowRelation
+
+
+-- !query
+select try_url_decode('http%3A%2F%2spark.apache.org')
+-- !query analysis
+Project [try_url_decode(http%3A%2F%2spark.apache.org) AS
try_url_decode(http%3A%2F%2spark.apache.org)#x]
++- OneRowRelation
+
+
+-- !query
+select try_url_decode('inva lid://user:pass@host/file\\;param?query\\;p2')
+-- !query analysis
+Project [try_url_decode(inva lid://user:pass@host/file\;param?query\;p2) AS
try_url_decode(inva lid://user:pass@host/file\;param?query\;p2)#x]
++- OneRowRelation
+
+
+-- !query
+select try_url_decode(null)
+-- !query analysis
+Project [try_url_decode(null) AS try_url_decode(NULL)#x]
++- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
index be69e5ffb879..222473feffe1 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
@@ -17,4 +17,10 @@ select url_encode(null);
select url_decode('https%3A%2F%2Fspark.apache.org');
select url_decode('http%3A%2F%2spark.apache.org');
select url_decode('inva lid://user:pass@host/file\\;param?query\\;p2');
-select url_decode(null);
\ No newline at end of file
+select url_decode(null);
+
+-- try_url_decode function
+select try_url_decode('https%3A%2F%2Fspark.apache.org');
+select try_url_decode('http%3A%2F%2spark.apache.org');
+select try_url_decode('inva lid://user:pass@host/file\\;param?query\\;p2');
+select try_url_decode(null);
diff --git
a/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
index 02b17f3356ff..2d1daee8500a 100644
--- a/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
@@ -124,3 +124,35 @@ select url_decode(null)
struct<url_decode(NULL):string>
-- !query output
NULL
+
+
+-- !query
+select try_url_decode('https%3A%2F%2Fspark.apache.org')
+-- !query schema
+struct<try_url_decode(https%3A%2F%2Fspark.apache.org):string>
+-- !query output
+https://spark.apache.org
+
+
+-- !query
+select try_url_decode('http%3A%2F%2spark.apache.org')
+-- !query schema
+struct<try_url_decode(http%3A%2F%2spark.apache.org):string>
+-- !query output
+NULL
+
+
+-- !query
+select try_url_decode('inva lid://user:pass@host/file\\;param?query\\;p2')
+-- !query schema
+struct<try_url_decode(inva lid://user:pass@host/file\;param?query\;p2):string>
+-- !query output
+inva lid://user:pass@host/file\;param?query\;p2
+
+
+-- !query
+select try_url_decode(null)
+-- !query schema
+struct<try_url_decode(NULL):string>
+-- !query output
+NULL
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]