This is an automated email from the ASF dual-hosted git repository.
tanner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/main by this push:
new ada4fe2715 [CALCITE-5911] Add REGEXP_EXTRACT_ALL function (enabled in
BigQuery library)
ada4fe2715 is described below
commit ada4fe2715aac8e30751cbcfb84e0b60d89db4ee
Author: Jerin John <[email protected]>
AuthorDate: Tue Jul 25 15:29:31 2023 -0700
[CALCITE-5911] Add REGEXP_EXTRACT_ALL function (enabled in BigQuery library)
---
babel/src/test/resources/sql/big-query.iq | 59 ++++++++++++++++++++
.../calcite/adapter/enumerable/RexImpTable.java | 2 +
.../org/apache/calcite/runtime/SqlFunctions.java | 65 ++++++++++++++--------
.../calcite/sql/fun/SqlLibraryOperators.java | 9 +++
.../org/apache/calcite/util/BuiltInMethod.java | 2 +
.../org/apache/calcite/test/SqlFunctionsTest.java | 29 ++++++++++
site/_docs/reference.md | 1 +
.../org/apache/calcite/test/SqlOperatorTest.java | 23 ++++++++
8 files changed, 167 insertions(+), 23 deletions(-)
diff --git a/babel/src/test/resources/sql/big-query.iq
b/babel/src/test/resources/sql/big-query.iq
index 353b7c518c..f38032ae32 100755
--- a/babel/src/test/resources/sql/big-query.iq
+++ b/babel/src/test/resources/sql/big-query.iq
@@ -1114,6 +1114,65 @@ SELECT REGEXP_EXTRACT("abcadcabcaecghi", "a.c", 3, 0);
Invalid integer input '0' for argument 'occurrence' in REGEXP_EXTRACT
!error
+#####################################################################
+# REGEXP_EXTRACT_ALL(value, regexp)
+#
+# Returns an array of all substrings in value that matches the regexp.
+# Returns an empty array if there is no match.
+# Returns an exception if regex is invalid or has more than one capturing
group.
+
+WITH code_markdown AS
+ (SELECT 'Try `function(x)` or `function(y)`' as code)
+SELECT
+ REGEXP_EXTRACT_ALL(code, '`(.+?)`') AS example
+FROM code_markdown;
++----------------------------+
+| example |
++----------------------------+
+| [function(x), function(y)] |
++----------------------------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", "abc(a.c)");
++------------+
+| EXPR$0 |
++------------+
+| [adc, aec] |
++------------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_EXTRACT_ALL("abacadaeafa", "a.a");
++-----------------+
+| EXPR$0 |
++-----------------+
+| [aba, ada, afa] |
++-----------------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_EXTRACT_ALL("12345abc123", "a[0-9]");
++--------+
+| EXPR$0 |
++--------+
+| [] |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_EXTRACT_ALL("abc def ghi", "{4,1}");
+Invalid regular expression for REGEXP_EXTRACT_ALL: 'Illegal repetition range
near index 4 {4,1} ^'
+!error
+
+SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", "(a.c).(.*)$");
+Multiple capturing groups (count=2) not allowed in regex input for
REGEXP_EXTRACT_ALL
+!error
+
#####################################################################
# REGEXP_SUBSTR(value, regexp[, position[, occurrence]])
#
diff --git
a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
index b9aac10a35..b4a5cb8ed9 100644
--- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
+++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
@@ -220,6 +220,7 @@ import static
org.apache.calcite.sql.fun.SqlLibraryOperators.PARSE_URL;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.POW;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT;
+import static
org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE;
@@ -573,6 +574,7 @@ public class RexImpTable {
defineReflective(REGEXP_CONTAINS, BuiltInMethod.REGEXP_CONTAINS.method);
defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method,
BuiltInMethod.REGEXP_EXTRACT3.method,
BuiltInMethod.REGEXP_EXTRACT4.method);
+ defineReflective(REGEXP_EXTRACT_ALL,
BuiltInMethod.REGEXP_EXTRACT_ALL.method);
map.put(TRIM, new TrimImplementor());
diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
index 0afe39748c..c64b70ad18 100644
--- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
+++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
@@ -388,22 +388,34 @@ public class SqlFunctions {
.maximumSize(FUNCTION_LEVEL_CACHE_MAX_SIZE.value())
.build(CacheLoader.from(Key::toPattern));
- /** SQL {@code REGEXP_CONTAINS(value, regexp)} function.
- * Throws a runtime exception for invalid regular expressions.*/
- public boolean regexpContains(String value, String regex) {
- final Pattern pattern;
+ /** Helper for regex validation in REGEXP_* fns. */
+ private Pattern validateRegexPattern(String regex, String methodName) {
try {
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
- pattern = cache.getUnchecked(new Key(0, regex));
+ return cache.getUnchecked(new Key(0, regex));
} catch (UncheckedExecutionException e) {
if (e.getCause() instanceof PatternSyntaxException) {
throw RESOURCE.invalidRegexInputForRegexpFunctions(
requireNonNull(e.getCause().getMessage(), "message")
- .replace(System.lineSeparator(), " "),
"REGEXP_CONTAINS").ex();
+ .replace(System.lineSeparator(), " "), methodName).ex();
}
throw e;
}
+ }
+
+ /** Helper for multiple capturing group regex check in REGEXP_EXTRACT fns.
*/
+ private void checkMultipleCapturingGroupsInRegex(Matcher matcher, String
methodName) {
+ if (matcher.groupCount() > 1) {
+ throw RESOURCE.multipleCapturingGroupsForRegexpExtract(
+ Integer.toString(matcher.groupCount()), methodName).ex();
+ }
+ }
+
+ /** SQL {@code REGEXP_CONTAINS(value, regexp)} function.
+ * Throws a runtime exception for invalid regular expressions.*/
+ public boolean regexpContains(String value, String regex) {
+ final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS");
return pattern.matcher(value).find();
}
@@ -429,18 +441,8 @@ public class SqlFunctions {
int occurrence) {
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
- final Pattern pattern;
- String methodName = "REGEXP_EXTRACT";
- try {
- pattern = cache.getUnchecked(new Key(0, regex));
- } catch (UncheckedExecutionException e) {
- if (e.getCause() instanceof PatternSyntaxException) {
- throw RESOURCE.invalidRegexInputForRegexpFunctions(
- requireNonNull(e.getCause().getMessage(), "message")
- .replace(System.lineSeparator(), " "), methodName).ex();
- }
- throw e;
- }
+ final String methodName = "REGEXP_EXTRACT";
+ final Pattern pattern = validateRegexPattern(regex, methodName);
if (position <= 0) {
throw
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position),
@@ -456,13 +458,9 @@ public class SqlFunctions {
}
Matcher matcher = pattern.matcher(value);
+ checkMultipleCapturingGroupsInRegex(matcher, methodName);
matcher.region(position - 1, value.length());
- if (matcher.groupCount() > 1) {
- throw RESOURCE.multipleCapturingGroupsForRegexpExtract(
- Integer.toString(matcher.groupCount()), methodName).ex();
- }
-
String match = null;
while (occurrence > 0) {
if (matcher.find()) {
@@ -476,6 +474,27 @@ public class SqlFunctions {
return match;
}
+ /** SQL {@code REGEXP_EXTRACT_ALL(value, regexp)} function.
+ * Returns an empty array if there is no match, returns an exception if
regex is invalid.*/
+ public List<String> regexpExtractAll(String value, String regex) {
+ // Uses java.util.regex as a standard for regex processing
+ // in Calcite instead of RE2 used by BigQuery/GoogleSQL
+ final String methodName = "REGEXP_EXTRACT_ALL";
+ final Pattern regexp = validateRegexPattern(regex, methodName);
+
+ Matcher matcher = regexp.matcher(value);
+ checkMultipleCapturingGroupsInRegex(matcher, methodName);
+
+ ImmutableList.Builder<String> matches = ImmutableList.builder();
+ while (matcher.find()) {
+ String match = matcher.group(matcher.groupCount());
+ if (match != null && !match.isEmpty()) {
+ matches.add(match);
+ }
+ }
+ return matches.build();
+ }
+
/** SQL {@code REGEXP_REPLACE} function with 3 arguments. */
public String regexpReplace(String s, String regex,
String replacement) {
diff --git
a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
index eba88e93d7..21503e4b73 100644
--- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
+++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
@@ -491,6 +491,15 @@ public abstract class SqlLibraryOperators {
OperandTypes.STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER,
SqlFunctionCategory.STRING);
+ /** The "REGEXP_EXTRACT_ALL(value, regexp)" function.
+ * Returns the substring in value that matches the regexp. Returns NULL if
there is no match. */
+ @LibraryOperator(libraries = {BIG_QUERY})
+ public static final SqlBasicFunction REGEXP_EXTRACT_ALL =
+ SqlBasicFunction.create("REGEXP_EXTRACT_ALL", ReturnTypes.ARG0_NULLABLE
+ .andThen(SqlTypeTransforms.TO_ARRAY),
+ OperandTypes.STRING_STRING,
+ SqlFunctionCategory.STRING);
+
@LibraryOperator(libraries = {MYSQL, ORACLE})
public static final SqlFunction REGEXP_REPLACE = new
SqlRegexpReplaceFunction();
diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
index cdc9fbc83f..53edb703e4 100644
--- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
+++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
@@ -529,6 +529,8 @@ public enum BuiltInMethod {
String.class, String.class, int.class),
REGEXP_EXTRACT4(SqlFunctions.RegexFunction.class, "regexpExtract",
String.class, String.class, int.class, int.class),
+ REGEXP_EXTRACT_ALL(SqlFunctions.RegexFunction.class, "regexpExtractAll",
+ String.class, String.class),
REGEXP_REPLACE3(SqlFunctions.RegexFunction.class, "regexpReplace",
String.class, String.class, String.class),
REGEXP_REPLACE4(SqlFunctions.RegexFunction.class, "regexpReplace",
diff --git a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
index 726c047525..52a33b0e8f 100644
--- a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
+++ b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
@@ -338,6 +338,35 @@ class SqlFunctionsTest {
}
}
+ @Test void testRegexpExtractAll() {
+ final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
+
+ assertThat(f.regexpExtractAll("abcadcabcaecghi", "ac"), is(list()));
+ assertThat(f.regexpExtractAll("abcadcabcaecghi", "abc(a.c)"),
is(list("adc", "aec")));
+ assertThat(f.regexpExtractAll("abcadcabcaecghi", "a.c"), is(list("abc",
"adc", "abc", "aec")));
+ assertThat(f.regexpExtractAll("banana", "ana"), is(list("ana")));
+ assertThat(f.regexpExtractAll("abacadaeafa", "a.a"), is(list("aba", "ada",
"afa")));
+ assertThat(f.regexpExtractAll("abcdefghijklmnop", ".+"),
is(list("abcdefghijklmnop")));
+
+ try {
+ final List<String> s = f.regexpExtractAll("abc def ghi", "(abc");
+ fail("expected error, got array: " + s);
+ } catch (RuntimeException e) {
+ assertThat(e.getMessage(),
+ is("Invalid regular expression for REGEXP_EXTRACT_ALL: 'Unclosed
group near index 4 "
+ + "(abc'"));
+ }
+
+ try {
+ final List<String> s = f.regexpExtractAll("abcadcabcaecghi",
"(abc).(ax).(a.c)");
+ fail("expected error, got array:" + s);
+ } catch (RuntimeException e) {
+ assertThat(e.getMessage(),
+ is("Multiple capturing groups (count=3) not allowed in regex input
for "
+ + "REGEXP_EXTRACT_ALL"));
+ }
+ }
+
@Test void testRegexpReplace() {
final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
assertThat(f.regexpReplace("a b c", "b", "X"), is("a X c"));
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 834d4f6174..904f30df24 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -2786,6 +2786,7 @@ BigQuery's type system uses confusingly different names
for types and functions:
| b | POW(numeric1, numeric2) | Returns *numeric1*
raised to the power *numeric2*
| b | REGEXP_CONTAINS(string, regexp) | Returns whether
*string* is a partial match for the *regexp*
| b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the
substring in *string* that matches the *regexp*, starting search at *position*
(default 1), and until locating the nth *occurrence* (default 1). Returns NULL
if there is no match
+| b | REGEXP_EXTRACT_ALL(string, regexp) | Returns an array of all
substrings in *string* that matches the *regexp*. Returns an empty array if
there is no match
| m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [,
matchType]]]) | Replaces all substrings of *string* that match *regexp* with
*rep* at the starting *pos* in expr (if omitted, the default is 1),
*occurrence* means which occurrence of a match to search for (if omitted, the
default is 1), *matchType* specifies how to perform matching
| b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for
REGEXP_EXTRACT
| b m p | REPEAT(string, integer) | Returns a string
consisting of *string* repeated of *integer* times; returns an empty string if
*integer* is less than 1
diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
index 872114fdb7..e5ff71d424 100644
--- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
+++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
@@ -4675,6 +4675,29 @@ public class SqlOperatorTest {
f.checkQuery("select regexp_extract('a9cadca5c4aecghi', 'a[0-9]c', 1, 3)");
}
+ @Test void testRegexpExtractAllFunc() {
+ final SqlOperatorFixture f =
+
fixture().setFor(SqlLibraryOperators.REGEXP_EXTRACT_ALL).withLibrary(SqlLibrary.BIG_QUERY);
+
+ f.checkScalar("regexp_extract_all('a9cadca5c4aecghi', 'a[0-9]c')", "[a9c,
a5c]", "CHAR(16) "
+ + "NOT NULL ARRAY NOT NULL");
+ f.checkScalar("regexp_extract_all('abcde', '.')", "[a, b, c, d, e]",
"CHAR(5) NOT NULL ARRAY "
+ + "NOT NULL");
+ f.checkScalar("regexp_extract_all('abcadcabcaecghi', 'ac')", "[]",
"CHAR(15) NOT NULL ARRAY "
+ + "NOT NULL");
+ f.checkScalar("regexp_extract_all('[email protected], [email protected],
[email protected]', "
+ + "'@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+')", "[@bar.com, @gmail.com,
@outlook.com]", "CHAR(43) "
+ + "NOT NULL ARRAY NOT NULL");
+
+ f.checkNull("regexp_extract_all('abc def ghi', cast(null as varchar))");
+ f.checkNull("regexp_extract_all(cast(null as varchar), 'abc')");
+ f.checkNull("regexp_extract_all(cast(null as varchar), cast(null as
varchar))");
+
+ f.checkQuery("select regexp_extract_all('abc def ghi', 'abc')");
+ f.checkQuery("select regexp_extract_all('[email protected]',
'@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')");
+ f.checkQuery("select regexp_extract_all('55as56664as422', '\\d{10}')");
+ }
+
@Test void testRegexpReplaceFunc() {
final SqlOperatorFixture f0 = fixture();
final Consumer<SqlOperatorFixture> consumer = f -> {