This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 5adcddb87a0 [SPARK-39695][SQL] Add the `REGEXP_SUBSTR` function
5adcddb87a0 is described below
commit 5adcddb87a052ce8e3b3c917c61f019bea5532ae
Author: Max Gekk <[email protected]>
AuthorDate: Thu Jul 7 11:22:41 2022 +0300
[SPARK-39695][SQL] Add the `REGEXP_SUBSTR` function
### What changes were proposed in this pull request?
In the PR, I propose to add new expression `RegExpSubStr` as a runtime
replaceable expression of `NullIf` and `RegExpExtract`. And bind the expression
to the function name `REGEXP_SUBSTR`. The `REGEXP_SUBSTR` function returns the
substring that matches a regular expression within a string. It takes two
parameters:
1. An expression that specifies the string in which the search is to take
place.
2. An expression that specifies the regular expression string that is the
pattern for the search.
If the regular expression is not found, the result is **null** (this
behaviour is similar to other DBMSs). When any of the input parameters are
NULL, the function returns NULL too.
For example:
```sql
spark-sql> CREATE TABLE log (logs string);
spark-sql> INSERT INTO log (logs) VALUES
> ('127.0.0.1 - - [10/Jan/2022:16:55:36 -0800] "GET / HTTP/1.0"
200 2217'),
> ('192.168.1.99 - - [14/Feb/2022:10:27:10 -0800] "GET
/cgi-bin/try/ HTTP/1.0" 200 3396');
spark-sql> SELECT REGEXP_SUBSTR
(logs,'\\b\\d{1,3}\.\\d{1,3}\.\\d{1,3}\.\\d{1,3}\\b') AS IP, REGEXP_SUBSTR
(logs,'([\\w:\/]+\\s[+\-]\\d{4})') AS DATE FROM log;
127.0.0.1 10/Jan/2022:16:55:36 -0800
192.168.1.99 14/Feb/2022:10:27:10 -0800
```
### Why are the changes needed?
To make the migration process from other systems to Spark SQL easier, and
achieve feature parity to such systems. For example, the systems below support
the `REGEXP_SUBSTR` function, see:
- Oracle:
https://docs.oracle.com/cd/B12037_01/server.101/b10759/functions116.htm
- DB2: https://www.ibm.com/docs/en/db2/11.5?topic=functions-regexp-substr
- Snowflake:
https://docs.snowflake.com/en/sql-reference/functions/regexp_substr.html
- BigQuery:
https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_substr
- Redshift:
https://docs.aws.amazon.com/redshift/latest/dg/REGEXP_SUBSTR.html
- MariaDB: https://mariadb.com/kb/en/regexp_substr/
- Exasol DB:
https://docs.exasol.com/db/latest/sql_references/functions/alphabeticallistfunctions/regexp_substr.htm
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By running new tests:
```
$ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z
regexp-functions.sql"
$ build/sbt "sql/testOnly *ExpressionsSchemaSuite"
$ build/sbt "sql/test:testOnly
org.apache.spark.sql.expressions.ExpressionInfoSuite"
```
Closes #37101 from MaxGekk/regexp_substr.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../sql/catalyst/analysis/FunctionRegistry.scala | 1 +
.../catalyst/expressions/regexpExpressions.scala | 39 +++++++++++++++
.../sql-functions/sql-expression-schema.md | 1 +
.../sql-tests/inputs/regexp-functions.sql | 9 ++++
.../sql-tests/results/regexp-functions.sql.out | 56 ++++++++++++++++++++++
5 files changed, 106 insertions(+)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 52d84cfa175..20c719aec68 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -585,6 +585,7 @@ object FunctionRegistry {
expression[XPathShort]("xpath_short"),
expression[XPathString]("xpath_string"),
expression[RegExpCount]("regexp_count"),
+ expression[RegExpSubStr]("regexp_substr"),
// datetime functions
expression[AddMonths]("add_months"),
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 8d813058296..b240e849f4d 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -1004,3 +1004,42 @@ case class RegExpCount(left: Expression, right:
Expression)
newChildren: IndexedSeq[Expression]): RegExpCount =
copy(left = newChildren(0), right = newChildren(1))
}
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """
+ _FUNC_(str, regexp) - Returns the substring that matches the regular
expression `regexp` within the string `str`. If the regular expression is not
found, the result is null.
+ """,
+ arguments = """
+ Arguments:
+ * str - a string expression.
+ * regexp - a string representing a regular expression. The regex string
should be a Java regular expression.
+ """,
+ examples = """
+ Examples:
+ > SELECT _FUNC_('Steven Jones and Stephen Smith are the best players',
'Ste(v|ph)en');
+ Steven
+ > SELECT _FUNC_('Steven Jones and Stephen Smith are the best players',
'Jeck');
+ NULL
+ """,
+ since = "3.4.0",
+ group = "string_funcs")
+// scalastyle:on line.size.limit
+case class RegExpSubStr(left: Expression, right: Expression)
+ extends RuntimeReplaceable with ImplicitCastInputTypes {
+
+ override lazy val replacement: Expression =
+ new NullIf(
+ RegExpExtract(subject = left, regexp = right, idx = Literal(0)),
+ Literal(""))
+
+ override def prettyName: String = "regexp_substr"
+
+ override def children: Seq[Expression] = Seq(left, right)
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+
+ override protected def withNewChildrenInternal(
+ newChildren: IndexedSeq[Expression]): RegExpSubStr =
+ copy(left = newChildren(0), right = newChildren(1))
+}
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index e24ae5d0f7b..0305781a48d 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -241,6 +241,7 @@
| org.apache.spark.sql.catalyst.expressions.RegExpExtract | regexp_extract |
SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1) |
struct<regexp_extract(100-200, (\d+)-(\d+), 1):string> |
| org.apache.spark.sql.catalyst.expressions.RegExpExtractAll |
regexp_extract_all | SELECT regexp_extract_all('100-200, 300-400',
'(\\d+)-(\\d+)', 1) | struct<regexp_extract_all(100-200, 300-400, (\d+)-(\d+),
1):array<string>> |
| org.apache.spark.sql.catalyst.expressions.RegExpReplace | regexp_replace |
SELECT regexp_replace('100-200', '(\\d+)', 'num') |
struct<regexp_replace(100-200, (\d+), num, 1):string> |
+| org.apache.spark.sql.catalyst.expressions.RegExpSubStr | regexp_substr |
SELECT regexp_substr('Steven Jones and Stephen Smith are the best players',
'Ste(v|ph)en') | struct<regexp_substr(Steven Jones and Stephen Smith are
the best players, Ste(v|ph)en):string> |
| org.apache.spark.sql.catalyst.expressions.Remainder | % | SELECT 2 % 1.8 |
struct<(2 % 1.8):decimal(2,1)> |
| org.apache.spark.sql.catalyst.expressions.Remainder | mod | SELECT 2 % 1.8 |
struct<(2 % 1.8):decimal(2,1)> |
| org.apache.spark.sql.catalyst.expressions.Reverse | reverse | SELECT
reverse('Spark SQL') | struct<reverse(Spark SQL):string> |
diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
index a48cfe68489..d828d761b77 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
@@ -60,3 +60,12 @@ SELECT regexp_count('the fox', '(?i)FOX');
SELECT regexp_count('passwd7 plain A1234 a1234', '(?=[^ ]*[a-z])(?=[^
]*[0-9])[^ ]+');
SELECT regexp_count(null, 'abc');
SELECT regexp_count('abc', null);
+
+-- regexp_substr
+SELECT regexp_substr('1a 2b 14m', '\\d+');
+SELECT regexp_substr('1a 2b 14m', '\\d+ ');
+SELECT regexp_substr('1a 2b 14m', '\\d+(a|b|m)');
+SELECT regexp_substr('1a 2b 14m', '\\d{2}(a|b|m)');
+SELECT regexp_substr('1a 2b 14m', '');
+SELECT regexp_substr('Spark', null);
+SELECT regexp_substr(null, '.*');
diff --git
a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
index a2eb2b2a148..c82b892f481 100644
--- a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
@@ -435,3 +435,59 @@ SELECT regexp_count('abc', null)
struct<regexp_count(abc, NULL):int>
-- !query output
NULL
+
+
+-- !query
+SELECT regexp_substr('1a 2b 14m', '\\d+')
+-- !query schema
+struct<regexp_substr(1a 2b 14m, \d+):string>
+-- !query output
+1
+
+
+-- !query
+SELECT regexp_substr('1a 2b 14m', '\\d+ ')
+-- !query schema
+struct<regexp_substr(1a 2b 14m, \d+ ):string>
+-- !query output
+NULL
+
+
+-- !query
+SELECT regexp_substr('1a 2b 14m', '\\d+(a|b|m)')
+-- !query schema
+struct<regexp_substr(1a 2b 14m, \d+(a|b|m)):string>
+-- !query output
+1a
+
+
+-- !query
+SELECT regexp_substr('1a 2b 14m', '\\d{2}(a|b|m)')
+-- !query schema
+struct<regexp_substr(1a 2b 14m, \d{2}(a|b|m)):string>
+-- !query output
+14m
+
+
+-- !query
+SELECT regexp_substr('1a 2b 14m', '')
+-- !query schema
+struct<regexp_substr(1a 2b 14m, ):string>
+-- !query output
+NULL
+
+
+-- !query
+SELECT regexp_substr('Spark', null)
+-- !query schema
+struct<regexp_substr(Spark, NULL):string>
+-- !query output
+NULL
+
+
+-- !query
+SELECT regexp_substr(null, '.*')
+-- !query schema
+struct<regexp_substr(NULL, .*):string>
+-- !query output
+NULL
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]