This is an automated email from the ASF dual-hosted git repository. tanner pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/main by this push: new ada4fe2715 [CALCITE-5911] Add REGEXP_EXTRACT_ALL function (enabled in BigQuery library) ada4fe2715 is described below commit ada4fe2715aac8e30751cbcfb84e0b60d89db4ee Author: Jerin John <jerinj...@google.com> AuthorDate: Tue Jul 25 15:29:31 2023 -0700 [CALCITE-5911] Add REGEXP_EXTRACT_ALL function (enabled in BigQuery library) --- babel/src/test/resources/sql/big-query.iq | 59 ++++++++++++++++++++ .../calcite/adapter/enumerable/RexImpTable.java | 2 + .../org/apache/calcite/runtime/SqlFunctions.java | 65 ++++++++++++++-------- .../calcite/sql/fun/SqlLibraryOperators.java | 9 +++ .../org/apache/calcite/util/BuiltInMethod.java | 2 + .../org/apache/calcite/test/SqlFunctionsTest.java | 29 ++++++++++ site/_docs/reference.md | 1 + .../org/apache/calcite/test/SqlOperatorTest.java | 23 ++++++++ 8 files changed, 167 insertions(+), 23 deletions(-) diff --git a/babel/src/test/resources/sql/big-query.iq b/babel/src/test/resources/sql/big-query.iq index 353b7c518c..f38032ae32 100755 --- a/babel/src/test/resources/sql/big-query.iq +++ b/babel/src/test/resources/sql/big-query.iq @@ -1114,6 +1114,65 @@ SELECT REGEXP_EXTRACT("abcadcabcaecghi", "a.c", 3, 0); Invalid integer input '0' for argument 'occurrence' in REGEXP_EXTRACT !error +##################################################################### +# REGEXP_EXTRACT_ALL(value, regexp) +# +# Returns an array of all substrings in value that matches the regexp. +# Returns an empty array if there is no match. +# Returns an exception if regex is invalid or has more than one capturing group. + +WITH code_markdown AS + (SELECT 'Try `function(x)` or `function(y)`' as code) +SELECT + REGEXP_EXTRACT_ALL(code, '`(.+?)`') AS example +FROM code_markdown; ++----------------------------+ +| example | ++----------------------------+ +| [function(x), function(y)] | ++----------------------------+ +(1 row) + +!ok + +SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", "abc(a.c)"); ++------------+ +| EXPR$0 | ++------------+ +| [adc, aec] | ++------------+ +(1 row) + +!ok + +SELECT REGEXP_EXTRACT_ALL("abacadaeafa", "a.a"); ++-----------------+ +| EXPR$0 | ++-----------------+ +| [aba, ada, afa] | ++-----------------+ +(1 row) + +!ok + +SELECT REGEXP_EXTRACT_ALL("12345abc123", "a[0-9]"); ++--------+ +| EXPR$0 | ++--------+ +| [] | ++--------+ +(1 row) + +!ok + +SELECT REGEXP_EXTRACT_ALL("abc def ghi", "{4,1}"); +Invalid regular expression for REGEXP_EXTRACT_ALL: 'Illegal repetition range near index 4 {4,1} ^' +!error + +SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", "(a.c).(.*)$"); +Multiple capturing groups (count=2) not allowed in regex input for REGEXP_EXTRACT_ALL +!error + ##################################################################### # REGEXP_SUBSTR(value, regexp[, position[, occurrence]]) # diff --git a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java index b9aac10a35..b4a5cb8ed9 100644 --- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java +++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java @@ -220,6 +220,7 @@ import static org.apache.calcite.sql.fun.SqlLibraryOperators.PARSE_URL; import static org.apache.calcite.sql.fun.SqlLibraryOperators.POW; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT; +import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT; import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE; @@ -573,6 +574,7 @@ public class RexImpTable { defineReflective(REGEXP_CONTAINS, BuiltInMethod.REGEXP_CONTAINS.method); defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method, BuiltInMethod.REGEXP_EXTRACT3.method, BuiltInMethod.REGEXP_EXTRACT4.method); + defineReflective(REGEXP_EXTRACT_ALL, BuiltInMethod.REGEXP_EXTRACT_ALL.method); map.put(TRIM, new TrimImplementor()); diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java index 0afe39748c..c64b70ad18 100644 --- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java +++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java @@ -388,22 +388,34 @@ public class SqlFunctions { .maximumSize(FUNCTION_LEVEL_CACHE_MAX_SIZE.value()) .build(CacheLoader.from(Key::toPattern)); - /** SQL {@code REGEXP_CONTAINS(value, regexp)} function. - * Throws a runtime exception for invalid regular expressions.*/ - public boolean regexpContains(String value, String regex) { - final Pattern pattern; + /** Helper for regex validation in REGEXP_* fns. */ + private Pattern validateRegexPattern(String regex, String methodName) { try { // Uses java.util.regex as a standard for regex processing // in Calcite instead of RE2 used by BigQuery/GoogleSQL - pattern = cache.getUnchecked(new Key(0, regex)); + return cache.getUnchecked(new Key(0, regex)); } catch (UncheckedExecutionException e) { if (e.getCause() instanceof PatternSyntaxException) { throw RESOURCE.invalidRegexInputForRegexpFunctions( requireNonNull(e.getCause().getMessage(), "message") - .replace(System.lineSeparator(), " "), "REGEXP_CONTAINS").ex(); + .replace(System.lineSeparator(), " "), methodName).ex(); } throw e; } + } + + /** Helper for multiple capturing group regex check in REGEXP_EXTRACT fns. */ + private void checkMultipleCapturingGroupsInRegex(Matcher matcher, String methodName) { + if (matcher.groupCount() > 1) { + throw RESOURCE.multipleCapturingGroupsForRegexpExtract( + Integer.toString(matcher.groupCount()), methodName).ex(); + } + } + + /** SQL {@code REGEXP_CONTAINS(value, regexp)} function. + * Throws a runtime exception for invalid regular expressions.*/ + public boolean regexpContains(String value, String regex) { + final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS"); return pattern.matcher(value).find(); } @@ -429,18 +441,8 @@ public class SqlFunctions { int occurrence) { // Uses java.util.regex as a standard for regex processing // in Calcite instead of RE2 used by BigQuery/GoogleSQL - final Pattern pattern; - String methodName = "REGEXP_EXTRACT"; - try { - pattern = cache.getUnchecked(new Key(0, regex)); - } catch (UncheckedExecutionException e) { - if (e.getCause() instanceof PatternSyntaxException) { - throw RESOURCE.invalidRegexInputForRegexpFunctions( - requireNonNull(e.getCause().getMessage(), "message") - .replace(System.lineSeparator(), " "), methodName).ex(); - } - throw e; - } + final String methodName = "REGEXP_EXTRACT"; + final Pattern pattern = validateRegexPattern(regex, methodName); if (position <= 0) { throw RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position), @@ -456,13 +458,9 @@ public class SqlFunctions { } Matcher matcher = pattern.matcher(value); + checkMultipleCapturingGroupsInRegex(matcher, methodName); matcher.region(position - 1, value.length()); - if (matcher.groupCount() > 1) { - throw RESOURCE.multipleCapturingGroupsForRegexpExtract( - Integer.toString(matcher.groupCount()), methodName).ex(); - } - String match = null; while (occurrence > 0) { if (matcher.find()) { @@ -476,6 +474,27 @@ public class SqlFunctions { return match; } + /** SQL {@code REGEXP_EXTRACT_ALL(value, regexp)} function. + * Returns an empty array if there is no match, returns an exception if regex is invalid.*/ + public List<String> regexpExtractAll(String value, String regex) { + // Uses java.util.regex as a standard for regex processing + // in Calcite instead of RE2 used by BigQuery/GoogleSQL + final String methodName = "REGEXP_EXTRACT_ALL"; + final Pattern regexp = validateRegexPattern(regex, methodName); + + Matcher matcher = regexp.matcher(value); + checkMultipleCapturingGroupsInRegex(matcher, methodName); + + ImmutableList.Builder<String> matches = ImmutableList.builder(); + while (matcher.find()) { + String match = matcher.group(matcher.groupCount()); + if (match != null && !match.isEmpty()) { + matches.add(match); + } + } + return matches.build(); + } + /** SQL {@code REGEXP_REPLACE} function with 3 arguments. */ public String regexpReplace(String s, String regex, String replacement) { diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java index eba88e93d7..21503e4b73 100644 --- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java +++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java @@ -491,6 +491,15 @@ public abstract class SqlLibraryOperators { OperandTypes.STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER, SqlFunctionCategory.STRING); + /** The "REGEXP_EXTRACT_ALL(value, regexp)" function. + * Returns the substring in value that matches the regexp. Returns NULL if there is no match. */ + @LibraryOperator(libraries = {BIG_QUERY}) + public static final SqlBasicFunction REGEXP_EXTRACT_ALL = + SqlBasicFunction.create("REGEXP_EXTRACT_ALL", ReturnTypes.ARG0_NULLABLE + .andThen(SqlTypeTransforms.TO_ARRAY), + OperandTypes.STRING_STRING, + SqlFunctionCategory.STRING); + @LibraryOperator(libraries = {MYSQL, ORACLE}) public static final SqlFunction REGEXP_REPLACE = new SqlRegexpReplaceFunction(); diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java index cdc9fbc83f..53edb703e4 100644 --- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java +++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java @@ -529,6 +529,8 @@ public enum BuiltInMethod { String.class, String.class, int.class), REGEXP_EXTRACT4(SqlFunctions.RegexFunction.class, "regexpExtract", String.class, String.class, int.class, int.class), + REGEXP_EXTRACT_ALL(SqlFunctions.RegexFunction.class, "regexpExtractAll", + String.class, String.class), REGEXP_REPLACE3(SqlFunctions.RegexFunction.class, "regexpReplace", String.class, String.class, String.class), REGEXP_REPLACE4(SqlFunctions.RegexFunction.class, "regexpReplace", diff --git a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java index 726c047525..52a33b0e8f 100644 --- a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java +++ b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java @@ -338,6 +338,35 @@ class SqlFunctionsTest { } } + @Test void testRegexpExtractAll() { + final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction(); + + assertThat(f.regexpExtractAll("abcadcabcaecghi", "ac"), is(list())); + assertThat(f.regexpExtractAll("abcadcabcaecghi", "abc(a.c)"), is(list("adc", "aec"))); + assertThat(f.regexpExtractAll("abcadcabcaecghi", "a.c"), is(list("abc", "adc", "abc", "aec"))); + assertThat(f.regexpExtractAll("banana", "ana"), is(list("ana"))); + assertThat(f.regexpExtractAll("abacadaeafa", "a.a"), is(list("aba", "ada", "afa"))); + assertThat(f.regexpExtractAll("abcdefghijklmnop", ".+"), is(list("abcdefghijklmnop"))); + + try { + final List<String> s = f.regexpExtractAll("abc def ghi", "(abc"); + fail("expected error, got array: " + s); + } catch (RuntimeException e) { + assertThat(e.getMessage(), + is("Invalid regular expression for REGEXP_EXTRACT_ALL: 'Unclosed group near index 4 " + + "(abc'")); + } + + try { + final List<String> s = f.regexpExtractAll("abcadcabcaecghi", "(abc).(ax).(a.c)"); + fail("expected error, got array:" + s); + } catch (RuntimeException e) { + assertThat(e.getMessage(), + is("Multiple capturing groups (count=3) not allowed in regex input for " + + "REGEXP_EXTRACT_ALL")); + } + } + @Test void testRegexpReplace() { final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction(); assertThat(f.regexpReplace("a b c", "b", "X"), is("a X c")); diff --git a/site/_docs/reference.md b/site/_docs/reference.md index 834d4f6174..904f30df24 100644 --- a/site/_docs/reference.md +++ b/site/_docs/reference.md @@ -2786,6 +2786,7 @@ BigQuery's type system uses confusingly different names for types and functions: | b | POW(numeric1, numeric2) | Returns *numeric1* raised to the power *numeric2* | b | REGEXP_CONTAINS(string, regexp) | Returns whether *string* is a partial match for the *regexp* | b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Returns NULL if there is no match +| b | REGEXP_EXTRACT_ALL(string, regexp) | Returns an array of all substrings in *string* that matches the *regexp*. Returns an empty array if there is no match | m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, matchType]]]) | Replaces all substrings of *string* that match *regexp* with *rep* at the starting *pos* in expr (if omitted, the default is 1), *occurrence* means which occurrence of a match to search for (if omitted, the default is 1), *matchType* specifies how to perform matching | b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for REGEXP_EXTRACT | b m p | REPEAT(string, integer) | Returns a string consisting of *string* repeated of *integer* times; returns an empty string if *integer* is less than 1 diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java index 872114fdb7..e5ff71d424 100644 --- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java +++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java @@ -4675,6 +4675,29 @@ public class SqlOperatorTest { f.checkQuery("select regexp_extract('a9cadca5c4aecghi', 'a[0-9]c', 1, 3)"); } + @Test void testRegexpExtractAllFunc() { + final SqlOperatorFixture f = + fixture().setFor(SqlLibraryOperators.REGEXP_EXTRACT_ALL).withLibrary(SqlLibrary.BIG_QUERY); + + f.checkScalar("regexp_extract_all('a9cadca5c4aecghi', 'a[0-9]c')", "[a9c, a5c]", "CHAR(16) " + + "NOT NULL ARRAY NOT NULL"); + f.checkScalar("regexp_extract_all('abcde', '.')", "[a, b, c, d, e]", "CHAR(5) NOT NULL ARRAY " + + "NOT NULL"); + f.checkScalar("regexp_extract_all('abcadcabcaecghi', 'ac')", "[]", "CHAR(15) NOT NULL ARRAY " + + "NOT NULL"); + f.checkScalar("regexp_extract_all('f...@bar.com, f...@gmail.com, f...@outlook.com', " + + "'@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+')", "[@bar.com, @gmail.com, @outlook.com]", "CHAR(43) " + + "NOT NULL ARRAY NOT NULL"); + + f.checkNull("regexp_extract_all('abc def ghi', cast(null as varchar))"); + f.checkNull("regexp_extract_all(cast(null as varchar), 'abc')"); + f.checkNull("regexp_extract_all(cast(null as varchar), cast(null as varchar))"); + + f.checkQuery("select regexp_extract_all('abc def ghi', 'abc')"); + f.checkQuery("select regexp_extract_all('f...@bar.com', '@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')"); + f.checkQuery("select regexp_extract_all('55as56664as422', '\\d{10}')"); + } + @Test void testRegexpReplaceFunc() { final SqlOperatorFixture f0 = fixture(); final Consumer<SqlOperatorFixture> consumer = f -> {