This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e3b1bb117fe9 [SPARK-45262][SQL][TESTS][DOCS] Improve examples for regexp parameters
e3b1bb117fe9 is described below
commit e3b1bb117fe9bf0b17321e6359b7aa90f70a24b5
Author: Max Gekk <[email protected]>
AuthorDate: Fri Oct 6 22:34:40 2023 +0300
[SPARK-45262][SQL][TESTS][DOCS] Improve examples for regexp parameters
### What changes were proposed in this pull request?
In the PR, I propose to add a few more examples for `LIKE`, `ILIKE`,
`RLIKE`, `regexp_instr()` and `regexp_extract_all()` that highlight the correctness of
the current description and test a couple more corner cases.
### Why are the changes needed?
The description of `LIKE` says:
```
... in order to match "\abc", the pattern should be "\\abc"
```
but in Spark SQL shell:
```sql
spark-sql (default)> SELECT c FROM t;
\abc
spark-sql (default)> SELECT c LIKE "\\abc" FROM t;
[INVALID_FORMAT.ESC_IN_THE_MIDDLE] The format is invalid: '\\abc'. The escape character is not allowed to precede 'a'.
spark-sql (default)> SELECT c LIKE "\\\\abc" FROM t;
true
```
So, the description might confuse users since the pattern must contain 4
backslashes when it is written as a regular SQL string: the SQL parser first reduces
"\\\\abc" to \\abc, and the LIKE pattern parser then treats \\ as an escaped literal backslash.
The new example shows that the pattern "\\abc" is correct if the string is
written as a raw string:
```sql
spark-sql (default)> SELECT c LIKE R"\\abc" FROM t;
true
```
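Both forms reduce to the same LIKE pattern, so the two results above can be checked side by side (a sketch against the same table `t`; the shell prints one row with both columns):
```sql
spark-sql (default)> SELECT c LIKE "\\\\abc", c LIKE R"\\abc" FROM t;
true	true
```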
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By running the new and modified tests:
```
$ build/sbt "test:testOnly *.StringFunctionsSuite"
$ build/sbt "sql/test:testOnly
org.apache.spark.sql.expressions.ExpressionInfoSuite"
```
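The new `StringFunctionsSuite` cases boil down to checks like this minimal sketch via the DataFrame API (assumes a running `SparkSession` named `spark` with its implicits in scope, and the `like` column function available in recent Spark versions):
```scala
import org.apache.spark.sql.functions.{col, like}
import spark.implicits._

// `\abc` (triple-quoted, so Scala adds no extra escaping) should match the
// raw LIKE pattern `\\abc`: the pattern's `\\` escapes to one literal backslash.
val df = Seq(("""\abc""", """\\abc""")).toDF("i", "p")
df.select(like(col("i"), col("p"))).show() // expected: true
```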
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43037 from MaxGekk/fix-like-doc.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../sql/catalyst/expressions/regexpExpressions.scala | 18 ++++++++++++++++--
.../resources/sql-functions/sql-expression-schema.md | 2 +-
.../org/apache/spark/sql/StringFunctionsSuite.scala | 5 +++++
3 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 87ea8b5a102a..b33de303b5d5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -108,13 +108,15 @@ abstract class StringRegexExpression extends BinaryExpression
Examples:
> SELECT _FUNC_('Spark', '_park');
true
+ > SELECT '\\abc' AS S, S _FUNC_ r'\\abc', S _FUNC_ '\\\\abc';
+ \abc true true
> SET spark.sql.parser.escapedStringLiterals=true;
spark.sql.parser.escapedStringLiterals true
> SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%';
true
> SET spark.sql.parser.escapedStringLiterals=false;
spark.sql.parser.escapedStringLiterals false
- > SELECT '%SystemDrive%\\Users\\John' _FUNC_ '\%SystemDrive\%\\\\Users%';
+ > SELECT '%SystemDrive%\\Users\\John' _FUNC_ r'%SystemDrive%\\Users%';
true
> SELECT '%SystemDrive%/Users/John' _FUNC_ '/%SystemDrive/%//Users%' ESCAPE '/';
true
@@ -226,13 +228,15 @@ case class Like(left: Expression, right: Expression, escapeChar: Char)
Examples:
> SELECT _FUNC_('Spark', '_Park');
true
+ > SELECT '\\abc' AS S, S _FUNC_ r'\\abc', S _FUNC_ '\\\\abc';
+ \abc true true
> SET spark.sql.parser.escapedStringLiterals=true;
spark.sql.parser.escapedStringLiterals true
> SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\users%';
true
> SET spark.sql.parser.escapedStringLiterals=false;
spark.sql.parser.escapedStringLiterals false
- > SELECT '%SystemDrive%\\USERS\\John' _FUNC_ '\%SystemDrive\%\\\\Users%';
+ > SELECT '%SystemDrive%\\USERS\\John' _FUNC_ r'%SystemDrive%\\Users%';
true
> SELECT '%SystemDrive%/Users/John' _FUNC_ '/%SYSTEMDrive/%//Users%' ESCAPE '/';
true
@@ -446,6 +450,8 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like
spark.sql.parser.escapedStringLiterals false
> SELECT _FUNC_('%SystemDrive%\\Users\\John', '%SystemDrive%\\\\Users.*');
true
+ > SELECT _FUNC_('%SystemDrive%\\Users\\John', r'%SystemDrive%\\Users.*');
+ true
""",
note = """
Use LIKE to match with simple string pattern.
@@ -596,6 +602,8 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression)
Examples:
> SELECT _FUNC_('100-200', '(\\d+)', 'num');
num-num
+ > SELECT _FUNC_('100-200', r'(\d+)', 'num');
+ num-num
""",
since = "1.5.0",
group = "string_funcs")
@@ -813,6 +821,8 @@ abstract class RegExpExtractBase
Examples:
> SELECT _FUNC_('100-200', '(\\d+)-(\\d+)', 1);
100
+ > SELECT _FUNC_('100-200', r'(\d+)-(\d+)', 1);
+ 100
""",
since = "1.5.0",
group = "string_funcs")
@@ -909,6 +919,8 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
Examples:
> SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
["100","300"]
+ > SELECT _FUNC_('100-200, 300-400', r'(\d+)-(\d+)', 1);
+ ["100","300"]
""",
since = "3.1.0",
group = "string_funcs")
@@ -1075,6 +1087,8 @@ case class RegExpSubStr(left: Expression, right: Expression)
""",
examples = """
Examples:
+ > SELECT _FUNC_(r"\abc", r"^\\abc$");
+ 1
> SELECT _FUNC_('[email protected]', '@[^.]*');
5
""",
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 89e840d12428..1573b5c56086 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -262,7 +262,7 @@
| org.apache.spark.sql.catalyst.expressions.RegExpCount | regexp_count | SELECT regexp_count('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en') | struct<regexp_count(Steven Jones and Stephen Smith are the best players, Ste(v|ph)en):int> |
| org.apache.spark.sql.catalyst.expressions.RegExpExtract | regexp_extract | SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1) | struct<regexp_extract(100-200, (\d+)-(\d+), 1):string> |
| org.apache.spark.sql.catalyst.expressions.RegExpExtractAll | regexp_extract_all | SELECT regexp_extract_all('100-200, 300-400', '(\\d+)-(\\d+)', 1) | struct<regexp_extract_all(100-200, 300-400, (\d+)-(\d+), 1):array<string>> |
-| org.apache.spark.sql.catalyst.expressions.RegExpInStr | regexp_instr | SELECT regexp_instr('[email protected]', '@[^.]*') | struct<regexp_instr([email protected], @[^.]*, 0):int> |
+| org.apache.spark.sql.catalyst.expressions.RegExpInStr | regexp_instr | SELECT regexp_instr(r"\abc", r"^\\abc$") | struct<regexp_instr(\abc, ^\\abc$, 0):int> |
| org.apache.spark.sql.catalyst.expressions.RegExpReplace | regexp_replace | SELECT regexp_replace('100-200', '(\\d+)', 'num') | struct<regexp_replace(100-200, (\d+), num, 1):string> |
| org.apache.spark.sql.catalyst.expressions.RegExpSubStr | regexp_substr | SELECT regexp_substr('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en') | struct<regexp_substr(Steven Jones and Stephen Smith are the best players, Ste(v|ph)en):string> |
| org.apache.spark.sql.catalyst.expressions.Remainder | % | SELECT 2 % 1.8 | struct<(2 % 1.8):decimal(2,1)> |
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 8e9be5dcdced..422498ac9dc6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -998,6 +998,11 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
checkAnswer(df.selectExpr("a ilike b escape '/'"), Seq(Row(true)))
checkAnswer(df.select(ilike(col("a"), col("b"), lit('/'))), Seq(Row(true)))
+ val df2 = Seq(("""abc\""", """%\\""")).toDF("i", "p")
+ checkAnswer(df2.select(like(col("i"), col("p"))), Seq(Row(true)))
+ val df3 = Seq(("""\abc""", """\\abc""")).toDF("i", "p")
+ checkAnswer(df3.select(like(col("i"), col("p"))), Seq(Row(true)))
+
checkError(
exception = intercept[AnalysisException] {
df1.select(like(col("a"), col("b"), lit(618))).collect()