This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e3b1bb117fe9 [SPARK-45262][SQL][TESTS][DOCS] Improve examples for regexp parameters
e3b1bb117fe9 is described below
commit e3b1bb117fe9bf0b17321e6359b7aa90f70a24b5
Author: Max Gekk <[email protected]>
AuthorDate: Fri Oct 6 22:34:40 2023 +0300
[SPARK-45262][SQL][TESTS][DOCS] Improve examples for regexp parameters
### What changes were proposed in this pull request?
In the PR, I propose to add a few more examples for `LIKE`, `ILIKE`,
`RLIKE`, `regexp_instr()` and `regexp_extract_all()` that highlight the correctness of
the current description and test a couple more corner cases.
### Why are the changes needed?
The description of `LIKE` says:
```
... in order to match "\abc", the pattern should be "\\abc"
```
but in Spark SQL shell:
```sql
spark-sql (default)> SELECT c FROM t;
\abc
spark-sql (default)> SELECT c LIKE "\\abc" FROM t;
[INVALID_FORMAT.ESC_IN_THE_MIDDLE] The format is invalid: '\\abc'. The escape character is not allowed to precede 'a'.
spark-sql (default)> SELECT c LIKE "\\\\abc" FROM t;
true
```
So, the description might confuse users since the pattern must contain 4
backslashes when it is written as a regular SQL string: the SQL parser first reduces
"\\\\abc" to \\abc, and the LIKE pattern parser then treats \\ as an escaped literal backslash.
The new example shows that the pattern "\\abc" is correct if the string is
written as a raw string:
```sql
spark-sql (default)> SELECT c LIKE R"\\abc" FROM t;
true
```
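Both forms reduce to the same LIKE pattern, so the two results above can be checked side by side (a sketch against the same table `t`; the shell prints one row with both columns):
```sql
spark-sql (default)> SELECT c LIKE "\\\\abc", c LIKE R"\\abc" FROM t;
true	true
```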
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By running the new and modified tests:
```
$ build/sbt "test:testOnly *.StringFunctionsSuite"
$ build/sbt "sql/test:testOnly
org.apache.spark.sql.expressions.ExpressionInfoSuite"
```
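The new `StringFunctionsSuite` cases boil down to checks like this minimal sketch via the DataFrame API (assumes a running `SparkSession` named `spark` with its implicits in scope, and the `like` column function available in recent Spark versions):
```scala
import org.apache.spark.sql.functions.{col, like}
import spark.implicits._

// `\abc` (triple-quoted, so Scala adds no extra escaping) should match the
// raw LIKE pattern `\\abc`: the pattern's `\\` escapes to one literal backslash.
val df = Seq(("""\abc""", """\\abc""")).toDF("i", "p")
df.select(like(col("i"), col("p"))).show() // expected: true
```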
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43037 from MaxGekk/fix-like-doc.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../sql/catalyst/expressions/regexpExpressions.scala | 18 ++++++++++++++++--
.../resources/sql-functions/sql-expression-schema.md | 2 +-
.../org/apache/spark/sql/StringFunctionsSuite.scala | 5 +++++
3 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 87ea8b5a102a..b33de303b5d5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -108,13 +108,15 @@ abstract class StringRegexExpression extends BinaryExpression
Examples:
> SELECT _FUNC_('Spark', '_park');
true
+ > SELECT '\\abc' AS S, S _FUNC_ r'\\abc', S _FUNC_ '\\\\abc';
+ \abc true true
> SET spark.sql.parser.escapedStringLiterals=true;
spark.sql.parser.escapedStringLiterals true
> SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%';
true
> SET spark.sql.parser.escapedStringLiterals=false;
spark.sql.parser.escapedStringLiterals false
- > SELECT '%SystemDrive%\\Users\\John' _FUNC_ '\%SystemDrive\%\\\\Users%';
+ > SELECT '%SystemDrive%\\Users\\John' _FUNC_ r'%SystemDrive%\\Users%';
true
> SELECT '%SystemDrive%/Users/John' _FUNC_ '/%SystemDrive/%//Users%' ESCAPE '/';
true
@@ -226,13 +228,15 @@ case class Like(left: Expression, right: Expression, escapeChar: Char)
Examples:
> SELECT _FUNC_('Spark', '_Park');
true
+ > SELECT '\\abc' AS S, S _FUNC_ r'\\abc', S _FUNC_ '\\\\abc';
+ \abc true true
> SET spark.sql.parser.escapedStringLiterals=true;
spark.sql.parser.escapedStringLiterals true
> SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\users%';
true
> SET spark.sql.parser.escapedStringLiterals=false;
spark.sql.parser.escapedStringLiterals false
- > SELECT '%SystemDrive%\\USERS\\John' _FUNC_ '\%SystemDrive\%\\\\Users%';
+ > SELECT '%SystemDrive%\\USERS\\John' _FUNC_ r'%SystemDrive%\\Users%';
true
> SELECT '%SystemDrive%/Users/John' _FUNC_ '/%SYSTEMDrive/%//Users%' ESCAPE '/';
true
@@ -446,6 +450,8 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like
spark.sql.parser.escapedStringLiterals false
> SELECT _FUNC_('%SystemDrive%\\Users\\John', '%SystemDrive%\\\\Users.*');
true
+ > SELECT _FUNC_('%SystemDrive%\\Users\\John', r'%SystemDrive%\\Users.*');
+ true
""",
note = """
Use LIKE to match with simple string pattern.
@@ -596,6 +602,8 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression)
Examples:
> SELECT _FUNC_('100-200', '(\\d+)', 'num');
num-num
+ > SELECT _FUNC_('100-200', r'(\d+)', 'num');
+ num-num
""",
since = "1.5.0",
group = "string_funcs")
@@ -813,6 +821,8 @@ abstract class RegExpExtractBase
Examples:
> SELECT _FUNC_('100-200', '(\\d+)-(\\d+)', 1);
100
+ > SELECT _FUNC_('100-200', r'(\d+)-(\d+)', 1);
+ 100
""",
since = "1.5.0",
group = "string_funcs")
@@ -909,6 +919,8 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
Examples:
> SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
["100","300"]
+ > SELECT _FUNC_('100-200, 300-400', r'(\d+)-(\d+)', 1);
+ ["100","300"]
""",
since = "3.1.0",
group = "string_funcs")
@@ -1075,6 +1087,8 @@ case class RegExpSubStr(left: Expression, right: Expression)
""",
examples = """
Examples:
+ > SELECT _FUNC_(r"\abc", r"^\\abc$");
+ 1
> SELECT _FUNC_('[email protected]', '@[^.]*');
5
""",
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 89e840d12428..1573b5c56086 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -262,7 +262,7 @@
| org.apache.spark.sql.catalyst.expressions.RegExpCount | regexp_count | SELECT regexp_count('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en') | struct<regexp_count(Steven Jones and Stephen Smith are the best players, Ste(v|ph)en):int> |
| org.apache.spark.sql.catalyst.expressions.RegExpExtract | regexp_extract | SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1) | struct<regexp_extract(100-200, (\d+)-(\d+), 1):string> |
| org.apache.spark.sql.catalyst.expressions.RegExpExtractAll | regexp_extract_all | SELECT regexp_extract_all('100-200, 300-400', '(\\d+)-(\\d+)', 1) | struct<regexp_extract_all(100-200, 300-400, (\d+)-(\d+), 1):array<string>> |
-| org.apache.spark.sql.catalyst.expressions.RegExpInStr | regexp_instr | SELECT regexp_instr('[email protected]', '@[^.]*') | struct<regexp_instr([email protected], @[^.]*, 0):int> |
+| org.apache.spark.sql.catalyst.expressions.RegExpInStr | regexp_instr | SELECT regexp_instr(r"\abc", r"^\\abc$") | struct<regexp_instr(\abc, ^\\abc$, 0):int> |
| org.apache.spark.sql.catalyst.expressions.RegExpReplace | regexp_replace | SELECT regexp_replace('100-200', '(\\d+)', 'num') | struct<regexp_replace(100-200, (\d+), num, 1):string> |
| org.apache.spark.sql.catalyst.expressions.RegExpSubStr | regexp_substr | SELECT regexp_substr('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en') | struct<regexp_substr(Steven Jones and Stephen Smith are the best players, Ste(v|ph)en):string> |
| org.apache.spark.sql.catalyst.expressions.Remainder | % | SELECT 2 % 1.8 | struct<(2 % 1.8):decimal(2,1)> |
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 8e9be5dcdced..422498ac9dc6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -998,6 +998,11 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
checkAnswer(df.selectExpr("a ilike b escape '/'"), Seq(Row(true)))
checkAnswer(df.select(ilike(col("a"), col("b"), lit('/'))), Seq(Row(true)))
+ val df2 = Seq(("""abc\""", """%\\""")).toDF("i", "p")
+ checkAnswer(df2.select(like(col("i"), col("p"))), Seq(Row(true)))
+ val df3 = Seq(("""\abc""", """\\abc""")).toDF("i", "p")
+ checkAnswer(df3.select(like(col("i"), col("p"))), Seq(Row(true)))
+
checkError(
exception = intercept[AnalysisException] {
df1.select(like(col("a"), col("b"), lit(618))).collect()