This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 41a2bebe725d [SPARK-48865][SQL] Add try_url_decode function
41a2bebe725d is described below

commit 41a2bebe725d7580ef848602223c76a824eb3c1f
Author: wforget <[email protected]>
AuthorDate: Wed Jul 17 21:50:16 2024 +0800

    [SPARK-48865][SQL] Add try_url_decode function
    
    ### What changes were proposed in this pull request?
    
    Add a `try_url_decode` function that performs the same operation as 
`url_decode`, but returns a NULL value instead of raising an error if the 
decoding cannot be performed.
    
    ### Why are the changes needed?
    
    In Hive, we usually do URL decoding like: `reflect('java.net.URLDecoder', 
'decode', 'test%1')`, which returns a `NULL` value instead of raising an error 
if the decoding cannot be performed.
    
    Although Spark provides a `try_reflect` function to do this, as 
commented in 
https://github.com/apache/spark/pull/34023#issuecomment-2113995703, the 
`reflect` function may prevent partition pruning from taking effect. So I 
propose adding a new `try_url_decode` function.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, this adds a new function: `try_url_decode`.
    
    ### How was this patch tested?
    
    added tests and did manual testing
    
    spark-sql:
    
![image](https://github.com/apache/spark/assets/17894939/0ffd3aa2-98f7-4af4-b478-67002b8b0d4b)
    
    pyspark:
    
![image](https://github.com/apache/spark/assets/17894939/d2c1926b-f9a0-422c-abc9-5f224d822811)
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #47294 from wForget/try_url_decode.
    
    Lead-authored-by: wforget <[email protected]>
    Co-authored-by: Kent Yao <[email protected]>
    Signed-off-by: Kent Yao <[email protected]>
---
 .../function_try_url_decode.explain                |   2 +
 .../queries/function_try_url_decode.json           |  25 ++++++++++++
 .../queries/function_try_url_decode.proto.bin      | Bin 0 -> 183 bytes
 .../scala/org/apache/spark/sql/functions.scala     |   9 +++++
 .../apache/spark/sql/PlanGenerationTestSuite.scala |   4 ++
 .../source/reference/pyspark.sql/functions.rst     |   1 +
 python/pyspark/sql/connect/functions/builtin.py    |   7 ++++
 python/pyspark/sql/functions/builtin.py            |  45 +++++++++++++++++++++
 .../sql/catalyst/analysis/FunctionRegistry.scala   |   1 +
 .../sql/catalyst/expressions/urlExpressions.scala  |  31 ++++++++++++++
 .../scala/org/apache/spark/sql/functions.scala     |   9 +++++
 .../sql-functions/sql-expression-schema.md         |   1 +
 .../analyzer-results/url-functions.sql.out         |  28 +++++++++++++
 .../resources/sql-tests/inputs/url-functions.sql   |   8 +++-
 .../sql-tests/results/url-functions.sql.out        |  32 +++++++++++++++
 15 files changed, 202 insertions(+), 1 deletion(-)

diff --git 
a/connect/common/src/test/resources/query-tests/explain-results/function_try_url_decode.explain
 
b/connect/common/src/test/resources/query-tests/explain-results/function_try_url_decode.explain
new file mode 100644
index 000000000000..74b360a6b5f3
--- /dev/null
+++ 
b/connect/common/src/test/resources/query-tests/explain-results/function_try_url_decode.explain
@@ -0,0 +1,2 @@
+Project [tryeval(static_invoke(UrlCodec.decode(g#0, UTF-8))) AS 
try_url_decode(g)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git 
a/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json
 
b/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json
new file mode 100644
index 000000000000..d51704c8f62e
--- /dev/null
+++ 
b/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.json
@@ -0,0 +1,25 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "try_url_decode",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "g"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git 
a/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin
 
b/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin
new file mode 100644
index 000000000000..3e84921b1220
Binary files /dev/null and 
b/connect/common/src/test/resources/query-tests/queries/function_try_url_decode.proto.bin
 differ
diff --git 
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 81f25b3d743f..02b25dd6cbb5 100644
--- 
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4588,6 +4588,15 @@ object functions {
    */
   def url_decode(str: Column): Column = Column.fn("url_decode", str)
 
+  /**
+   * This is a special version of `url_decode` that performs the same 
operation, but returns a
+   * NULL value instead of raising an error if the decoding cannot be 
performed.
+   *
+   * @group url_funcs
+   * @since 4.0.0
+   */
+  def try_url_decode(str: Column): Column = Column.fn("try_url_decode", str)
+
   /**
    * Translates a string into 'application/x-www-form-urlencoded' format using 
a specific encoding
    * scheme.
diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index 18bae516489f..07b03c4564bc 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -2677,6 +2677,10 @@ class PlanGenerationTestSuite
     fn.url_decode(fn.col("g"))
   }
 
+  functionTest("try_url_decode") {
+    fn.try_url_decode(fn.col("g"))
+  }
+
   functionTest("url_encode") {
     fn.url_encode(fn.col("g"))
   }
diff --git a/python/docs/source/reference/pyspark.sql/functions.rst 
b/python/docs/source/reference/pyspark.sql/functions.rst
index e0895959e893..c7ae525429ca 100644
--- a/python/docs/source/reference/pyspark.sql/functions.rst
+++ b/python/docs/source/reference/pyspark.sql/functions.rst
@@ -580,6 +580,7 @@ URL Functions
     parse_url
     url_decode
     url_encode
+    try_url_decode
 
 
 Misc Functions
diff --git a/python/pyspark/sql/connect/functions/builtin.py 
b/python/pyspark/sql/connect/functions/builtin.py
index 1f21e003bf01..dd81c11af292 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -2739,6 +2739,13 @@ def url_decode(str: "ColumnOrName") -> Column:
 url_decode.__doc__ = pysparkfuncs.url_decode.__doc__
 
 
+def try_url_decode(str: "ColumnOrName") -> Column:
+    return _invoke_function_over_columns("try_url_decode", str)
+
+
+try_url_decode.__doc__ = pysparkfuncs.try_url_decode.__doc__
+
+
 def url_encode(str: "ColumnOrName") -> Column:
     return _invoke_function_over_columns("url_encode", str)
 
diff --git a/python/pyspark/sql/functions/builtin.py 
b/python/pyspark/sql/functions/builtin.py
index 3193c3c4b574..d652e5923bce 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -12425,6 +12425,51 @@ def url_decode(str: "ColumnOrName") -> Column:
     return _invoke_function_over_columns("url_decode", str)
 
 
+@_try_remote_functions
+def try_url_decode(str: "ColumnOrName") -> Column:
+    """
+    This is a special version of `url_decode` that performs the same 
operation, but returns a
+    NULL value instead of raising an error if the decoding cannot be performed.
+
+    .. versionadded:: 4.0.0
+
+    Parameters
+    ----------
+    str : :class:`~pyspark.sql.Column` or str
+        A column of strings, each representing a URL-encoded string.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        A new column of strings, each representing the decoded string.
+
+    Examples
+    --------
+    Example 1: Decoding a URL-encoded string
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)], 
["url"])
+    >>> df.select(sf.try_url_decode(df.url)).show(truncate=False)
+    +------------------------+
+    |try_url_decode(url)     |
+    +------------------------+
+    |https://spark.apache.org|
+    +------------------------+
+
+    Example 2: Return NULL if the decoding cannot be performed.
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([("https%3A%2F%2spark.apache.org",)], 
["url"])
+    >>> df.select(sf.try_url_decode(df.url)).show()
+    +-------------------+
+    |try_url_decode(url)|
+    +-------------------+
+    |               NULL|
+    +-------------------+
+    """
+    return _invoke_function_over_columns("try_url_decode", str)
+
+
 @_try_remote_functions
 def url_encode(str: "ColumnOrName") -> Column:
     """
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 8a5a32c173bb..d47a34c3626c 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -461,6 +461,7 @@ object FunctionRegistry {
     expressionBuilder("try_to_timestamp", TryToTimestampExpressionBuilder, 
setAlias = true),
     expression[TryAesDecrypt]("try_aes_decrypt"),
     expression[TryReflect]("try_reflect"),
+    expression[TryUrlDecode]("try_url_decode"),
 
     // aggregate functions
     expression[HyperLogLogPlusPlus]("approx_count_distinct"),
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
index 1687b69e46be..c2b999f30161 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
@@ -106,6 +106,37 @@ case class UrlDecode(child: Expression)
   override def prettyName: String = "url_decode"
 }
 
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = """
+    _FUNC_(str) - This is a special version of `url_decode` that performs the 
same operation, but returns a NULL value instead of raising an error if the 
decoding cannot be performed.
+  """,
+  arguments = """
+    Arguments:
+      * str - a string expression to decode
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('https%3A%2F%2Fspark.apache.org');
+       https://spark.apache.org
+  """,
+  since = "4.0.0",
+  group = "url_funcs")
+// scalastyle:on line.size.limit
+case class TryUrlDecode(expr: Expression, replacement: Expression)
+  extends RuntimeReplaceable with InheritAnalysisRules {
+
+  def this(expr: Expression) = this(expr, TryEval(UrlDecode(expr)))
+
+  override protected def withNewChildInternal(newChild: Expression): 
Expression = {
+    copy(replacement = newChild)
+  }
+
+  override def parameters: Seq[Expression] = Seq(expr)
+
+  override def prettyName: String = "try_url_decode"
+}
+
 object UrlCodec {
   def encode(src: UTF8String, enc: UTF8String): UTF8String = {
     UTF8String.fromString(URLEncoder.encode(src.toString, enc.toString))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5b4d27fc65d0..dd9b8cd26ad2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4548,6 +4548,15 @@ object functions {
    */
   def url_decode(str: Column): Column = Column.fn("url_decode", str)
 
+  /**
+   * This is a special version of `url_decode` that performs the same 
operation, but returns
+   * a NULL value instead of raising an error if the decoding cannot be 
performed.
+   *
+   * @group url_funcs
+   * @since 4.0.0
+   */
+  def try_url_decode(str: Column): Column = Column.fn("try_url_decode", str)
+
   /**
    * Translates a string into 'application/x-www-form-urlencoded' format
    * using a specific encoding scheme.
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md 
b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index bf4622accf41..228c8f5df692 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -359,6 +359,7 @@
 | org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | 
SELECT try_to_binary('abc', 'utf-8') | struct<try_to_binary(abc, utf-8):binary> 
|
 | org.apache.spark.sql.catalyst.expressions.TryToNumber | try_to_number | 
SELECT try_to_number('454', '999') | struct<try_to_number(454, 
999):decimal(3,0)> |
 | org.apache.spark.sql.catalyst.expressions.TryToTimestampExpressionBuilder | 
try_to_timestamp | SELECT try_to_timestamp('2016-12-31 00:12:00') | 
struct<try_to_timestamp(2016-12-31 00:12:00):timestamp> |
+| org.apache.spark.sql.catalyst.expressions.TryUrlDecode | try_url_decode | 
SELECT try_url_decode('https%3A%2F%2Fspark.apache.org') | 
struct<try_url_decode(https%3A%2F%2Fspark.apache.org):string> |
 | org.apache.spark.sql.catalyst.expressions.TryValidateUTF8 | 
try_validate_utf8 | SELECT try_validate_utf8('Spark') | 
struct<try_validate_utf8(Spark):string> |
 | org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) 
| struct<typeof(1):string> |
 | org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT 
unbase64('U3BhcmsgU1FM') | struct<unbase64(U3BhcmsgU1FM):binary> |
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/url-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/url-functions.sql.out
index d93bbdb5400e..ca5984f93280 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/url-functions.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/url-functions.sql.out
@@ -102,3 +102,31 @@ select url_decode(null)
 -- !query analysis
 Project [url_decode(cast(null as string)) AS url_decode(NULL)#x]
 +- OneRowRelation
+
+
+-- !query
+select try_url_decode('https%3A%2F%2Fspark.apache.org')
+-- !query analysis
+Project [try_url_decode(https%3A%2F%2Fspark.apache.org) AS 
try_url_decode(https%3A%2F%2Fspark.apache.org)#x]
++- OneRowRelation
+
+
+-- !query
+select try_url_decode('http%3A%2F%2spark.apache.org')
+-- !query analysis
+Project [try_url_decode(http%3A%2F%2spark.apache.org) AS 
try_url_decode(http%3A%2F%2spark.apache.org)#x]
++- OneRowRelation
+
+
+-- !query
+select try_url_decode('inva lid://user:pass@host/file\\;param?query\\;p2')
+-- !query analysis
+Project [try_url_decode(inva lid://user:pass@host/file\;param?query\;p2) AS 
try_url_decode(inva lid://user:pass@host/file\;param?query\;p2)#x]
++- OneRowRelation
+
+
+-- !query
+select try_url_decode(null)
+-- !query analysis
+Project [try_url_decode(null) AS try_url_decode(NULL)#x]
++- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
index be69e5ffb879..222473feffe1 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
@@ -17,4 +17,10 @@ select url_encode(null);
 select url_decode('https%3A%2F%2Fspark.apache.org');
 select url_decode('http%3A%2F%2spark.apache.org');
 select url_decode('inva lid://user:pass@host/file\\;param?query\\;p2');
-select url_decode(null);
\ No newline at end of file
+select url_decode(null);
+
+-- try_url_decode function
+select try_url_decode('https%3A%2F%2Fspark.apache.org');
+select try_url_decode('http%3A%2F%2spark.apache.org');
+select try_url_decode('inva lid://user:pass@host/file\\;param?query\\;p2');
+select try_url_decode(null);
diff --git 
a/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
index 02b17f3356ff..2d1daee8500a 100644
--- a/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
@@ -124,3 +124,35 @@ select url_decode(null)
 struct<url_decode(NULL):string>
 -- !query output
 NULL
+
+
+-- !query
+select try_url_decode('https%3A%2F%2Fspark.apache.org')
+-- !query schema
+struct<try_url_decode(https%3A%2F%2Fspark.apache.org):string>
+-- !query output
+https://spark.apache.org
+
+
+-- !query
+select try_url_decode('http%3A%2F%2spark.apache.org')
+-- !query schema
+struct<try_url_decode(http%3A%2F%2spark.apache.org):string>
+-- !query output
+NULL
+
+
+-- !query
+select try_url_decode('inva lid://user:pass@host/file\\;param?query\\;p2')
+-- !query schema
+struct<try_url_decode(inva lid://user:pass@host/file\;param?query\;p2):string>
+-- !query output
+inva lid://user:pass@host/file\;param?query\;p2
+
+
+-- !query
+select try_url_decode(null)
+-- !query schema
+struct<try_url_decode(NULL):string>
+-- !query output
+NULL


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to