This is an automated email from the ASF dual-hosted git repository.

tanner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git


The following commit(s) were added to refs/heads/main by this push:
     new ada4fe2715 [CALCITE-5911] Add REGEXP_EXTRACT_ALL function (enabled in 
BigQuery library)
ada4fe2715 is described below

commit ada4fe2715aac8e30751cbcfb84e0b60d89db4ee
Author: Jerin John <jerinj...@google.com>
AuthorDate: Tue Jul 25 15:29:31 2023 -0700

    [CALCITE-5911] Add REGEXP_EXTRACT_ALL function (enabled in BigQuery library)
---
 babel/src/test/resources/sql/big-query.iq          | 59 ++++++++++++++++++++
 .../calcite/adapter/enumerable/RexImpTable.java    |  2 +
 .../org/apache/calcite/runtime/SqlFunctions.java   | 65 ++++++++++++++--------
 .../calcite/sql/fun/SqlLibraryOperators.java       |  9 +++
 .../org/apache/calcite/util/BuiltInMethod.java     |  2 +
 .../org/apache/calcite/test/SqlFunctionsTest.java  | 29 ++++++++++
 site/_docs/reference.md                            |  1 +
 .../org/apache/calcite/test/SqlOperatorTest.java   | 23 ++++++++
 8 files changed, 167 insertions(+), 23 deletions(-)

diff --git a/babel/src/test/resources/sql/big-query.iq 
b/babel/src/test/resources/sql/big-query.iq
index 353b7c518c..f38032ae32 100755
--- a/babel/src/test/resources/sql/big-query.iq
+++ b/babel/src/test/resources/sql/big-query.iq
@@ -1114,6 +1114,65 @@ SELECT REGEXP_EXTRACT("abcadcabcaecghi", "a.c", 3, 0);
 Invalid integer input '0' for argument 'occurrence' in REGEXP_EXTRACT
 !error
 
+#####################################################################
+# REGEXP_EXTRACT_ALL(value, regexp)
+#
+# Returns an array of all substrings in value that matches the regexp.
+# Returns an empty array if there is no match.
+# Returns an exception if regex is invalid or has more than one capturing 
group.
+
+WITH code_markdown AS
+  (SELECT 'Try `function(x)` or `function(y)`' as code)
+SELECT
+  REGEXP_EXTRACT_ALL(code, '`(.+?)`') AS example
+FROM code_markdown;
++----------------------------+
+| example                    |
++----------------------------+
+| [function(x), function(y)] |
++----------------------------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", "abc(a.c)");
++------------+
+| EXPR$0     |
++------------+
+| [adc, aec] |
++------------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_EXTRACT_ALL("abacadaeafa", "a.a");
++-----------------+
+| EXPR$0          |
++-----------------+
+| [aba, ada, afa] |
++-----------------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_EXTRACT_ALL("12345abc123", "a[0-9]");
++--------+
+| EXPR$0 |
++--------+
+| []     |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_EXTRACT_ALL("abc def ghi", "{4,1}");
+Invalid regular expression for REGEXP_EXTRACT_ALL: 'Illegal repetition range 
near index 4 {4,1}     ^'
+!error
+
+SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", "(a.c).(.*)$");
+Multiple capturing groups (count=2) not allowed in regex input for 
REGEXP_EXTRACT_ALL
+!error
+
 #####################################################################
 # REGEXP_SUBSTR(value, regexp[, position[, occurrence]])
 #
diff --git 
a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java 
b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
index b9aac10a35..b4a5cb8ed9 100644
--- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
+++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
@@ -220,6 +220,7 @@ import static 
org.apache.calcite.sql.fun.SqlLibraryOperators.PARSE_URL;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.POW;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT;
+import static 
org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE;
@@ -573,6 +574,7 @@ public class RexImpTable {
       defineReflective(REGEXP_CONTAINS, BuiltInMethod.REGEXP_CONTAINS.method);
       defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method,
           BuiltInMethod.REGEXP_EXTRACT3.method, 
BuiltInMethod.REGEXP_EXTRACT4.method);
+      defineReflective(REGEXP_EXTRACT_ALL, 
BuiltInMethod.REGEXP_EXTRACT_ALL.method);
 
       map.put(TRIM, new TrimImplementor());
 
diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java 
b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
index 0afe39748c..c64b70ad18 100644
--- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
+++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
@@ -388,22 +388,34 @@ public class SqlFunctions {
             .maximumSize(FUNCTION_LEVEL_CACHE_MAX_SIZE.value())
             .build(CacheLoader.from(Key::toPattern));
 
-    /** SQL {@code REGEXP_CONTAINS(value, regexp)} function.
-     * Throws a runtime exception for invalid regular expressions.*/
-    public boolean regexpContains(String value, String regex) {
-      final Pattern pattern;
+    /** Helper for regex validation in REGEXP_* fns. */
+    private Pattern validateRegexPattern(String regex, String methodName) {
       try {
         // Uses java.util.regex as a standard for regex processing
         // in Calcite instead of RE2 used by BigQuery/GoogleSQL
-        pattern = cache.getUnchecked(new Key(0, regex));
+        return cache.getUnchecked(new Key(0, regex));
       } catch (UncheckedExecutionException e) {
         if (e.getCause() instanceof PatternSyntaxException) {
           throw RESOURCE.invalidRegexInputForRegexpFunctions(
               requireNonNull(e.getCause().getMessage(), "message")
-                  .replace(System.lineSeparator(), " "), 
"REGEXP_CONTAINS").ex();
+                  .replace(System.lineSeparator(), " "), methodName).ex();
         }
         throw e;
       }
+    }
+
+    /** Helper for multiple capturing group regex check in REGEXP_EXTRACT fns. 
*/
+    private void checkMultipleCapturingGroupsInRegex(Matcher matcher, String 
methodName) {
+      if (matcher.groupCount() > 1) {
+        throw RESOURCE.multipleCapturingGroupsForRegexpExtract(
+            Integer.toString(matcher.groupCount()), methodName).ex();
+      }
+    }
+
+    /** SQL {@code REGEXP_CONTAINS(value, regexp)} function.
+     * Throws a runtime exception for invalid regular expressions.*/
+    public boolean regexpContains(String value, String regex) {
+      final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS");
       return pattern.matcher(value).find();
     }
 
@@ -429,18 +441,8 @@ public class SqlFunctions {
         int occurrence) {
       // Uses java.util.regex as a standard for regex processing
       // in Calcite instead of RE2 used by BigQuery/GoogleSQL
-      final Pattern pattern;
-      String methodName = "REGEXP_EXTRACT";
-      try {
-        pattern = cache.getUnchecked(new Key(0, regex));
-      } catch (UncheckedExecutionException e) {
-        if (e.getCause() instanceof PatternSyntaxException) {
-          throw RESOURCE.invalidRegexInputForRegexpFunctions(
-              requireNonNull(e.getCause().getMessage(), "message")
-                  .replace(System.lineSeparator(), " "), methodName).ex();
-        }
-        throw e;
-      }
+      final String methodName = "REGEXP_EXTRACT";
+      final Pattern pattern = validateRegexPattern(regex, methodName);
 
       if (position <= 0) {
         throw 
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position),
@@ -456,13 +458,9 @@ public class SqlFunctions {
       }
 
       Matcher matcher = pattern.matcher(value);
+      checkMultipleCapturingGroupsInRegex(matcher, methodName);
       matcher.region(position - 1, value.length());
 
-      if (matcher.groupCount() > 1) {
-        throw RESOURCE.multipleCapturingGroupsForRegexpExtract(
-            Integer.toString(matcher.groupCount()), methodName).ex();
-      }
-
       String match = null;
       while (occurrence > 0) {
         if (matcher.find()) {
@@ -476,6 +474,27 @@ public class SqlFunctions {
       return match;
     }
 
+    /** SQL {@code REGEXP_EXTRACT_ALL(value, regexp)} function.
+     *  Returns an empty array if there is no match, returns an exception if 
regex is invalid.*/
+    public List<String> regexpExtractAll(String value, String regex) {
+      // Uses java.util.regex as a standard for regex processing
+      // in Calcite instead of RE2 used by BigQuery/GoogleSQL
+      final String methodName = "REGEXP_EXTRACT_ALL";
+      final Pattern regexp = validateRegexPattern(regex, methodName);
+
+      Matcher matcher = regexp.matcher(value);
+      checkMultipleCapturingGroupsInRegex(matcher, methodName);
+
+      ImmutableList.Builder<String> matches = ImmutableList.builder();
+      while (matcher.find()) {
+        String match = matcher.group(matcher.groupCount());
+        if (match != null && !match.isEmpty()) {
+          matches.add(match);
+        }
+      }
+      return matches.build();
+    }
+
     /** SQL {@code REGEXP_REPLACE} function with 3 arguments. */
     public String regexpReplace(String s, String regex,
         String replacement) {
diff --git 
a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java 
b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
index eba88e93d7..21503e4b73 100644
--- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
+++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
@@ -491,6 +491,15 @@ public abstract class SqlLibraryOperators {
           OperandTypes.STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER,
           SqlFunctionCategory.STRING);
 
+  /** The "REGEXP_EXTRACT_ALL(value, regexp)" function.
+   * Returns the substring in value that matches the regexp. Returns NULL if 
there is no match. */
+  @LibraryOperator(libraries = {BIG_QUERY})
+  public static final SqlBasicFunction REGEXP_EXTRACT_ALL =
+      SqlBasicFunction.create("REGEXP_EXTRACT_ALL", ReturnTypes.ARG0_NULLABLE
+              .andThen(SqlTypeTransforms.TO_ARRAY),
+          OperandTypes.STRING_STRING,
+          SqlFunctionCategory.STRING);
+
   @LibraryOperator(libraries = {MYSQL, ORACLE})
   public static final SqlFunction REGEXP_REPLACE = new 
SqlRegexpReplaceFunction();
 
diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java 
b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
index cdc9fbc83f..53edb703e4 100644
--- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
+++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
@@ -529,6 +529,8 @@ public enum BuiltInMethod {
       String.class, String.class, int.class),
   REGEXP_EXTRACT4(SqlFunctions.RegexFunction.class, "regexpExtract",
       String.class, String.class, int.class, int.class),
+  REGEXP_EXTRACT_ALL(SqlFunctions.RegexFunction.class, "regexpExtractAll",
+      String.class, String.class),
   REGEXP_REPLACE3(SqlFunctions.RegexFunction.class, "regexpReplace",
       String.class, String.class, String.class),
   REGEXP_REPLACE4(SqlFunctions.RegexFunction.class, "regexpReplace",
diff --git a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java 
b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
index 726c047525..52a33b0e8f 100644
--- a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
+++ b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
@@ -338,6 +338,35 @@ class SqlFunctionsTest {
     }
   }
 
+  @Test void testRegexpExtractAll() {
+    final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
+
+    assertThat(f.regexpExtractAll("abcadcabcaecghi", "ac"), is(list()));
+    assertThat(f.regexpExtractAll("abcadcabcaecghi", "abc(a.c)"), 
is(list("adc", "aec")));
+    assertThat(f.regexpExtractAll("abcadcabcaecghi", "a.c"), is(list("abc", 
"adc", "abc", "aec")));
+    assertThat(f.regexpExtractAll("banana", "ana"), is(list("ana")));
+    assertThat(f.regexpExtractAll("abacadaeafa", "a.a"), is(list("aba", "ada", 
"afa")));
+    assertThat(f.regexpExtractAll("abcdefghijklmnop", ".+"), 
is(list("abcdefghijklmnop")));
+
+    try {
+      final List<String> s = f.regexpExtractAll("abc def ghi", "(abc");
+      fail("expected error, got array: " + s);
+    } catch (RuntimeException e) {
+      assertThat(e.getMessage(),
+          is("Invalid regular expression for REGEXP_EXTRACT_ALL: 'Unclosed 
group near index 4 "
+              + "(abc'"));
+    }
+
+    try {
+      final List<String> s = f.regexpExtractAll("abcadcabcaecghi", 
"(abc).(ax).(a.c)");
+      fail("expected error, got array:" + s);
+    } catch (RuntimeException e) {
+      assertThat(e.getMessage(),
+          is("Multiple capturing groups (count=3) not allowed in regex input 
for "
+              + "REGEXP_EXTRACT_ALL"));
+    }
+  }
+
   @Test void testRegexpReplace() {
     final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
     assertThat(f.regexpReplace("a b c", "b", "X"), is("a X c"));
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 834d4f6174..904f30df24 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -2786,6 +2786,7 @@ BigQuery's type system uses confusingly different names 
for types and functions:
 | b | POW(numeric1, numeric2)                        | Returns *numeric1* 
raised to the power *numeric2*
 | b | REGEXP_CONTAINS(string, regexp)                | Returns whether 
*string* is a partial match for the *regexp*
 | b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the 
substring in *string* that matches the *regexp*, starting search at *position* 
(default 1), and until locating the nth *occurrence* (default 1). Returns NULL 
if there is no match
+| b | REGEXP_EXTRACT_ALL(string, regexp)             | Returns an array of all 
substrings in *string* that matches the *regexp*. Returns an empty array if 
there is no match
 | m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, 
matchType]]]) | Replaces all substrings of *string* that match *regexp* with 
*rep* at the starting *pos* in expr (if omitted, the default is 1), 
*occurrence* means which occurrence of a match to search for (if omitted, the 
default is 1), *matchType* specifies how to perform matching
 | b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for 
REGEXP_EXTRACT
 | b m p | REPEAT(string, integer)                    | Returns a string 
consisting of *string* repeated of *integer* times; returns an empty string if 
*integer* is less than 1
diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java 
b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
index 872114fdb7..e5ff71d424 100644
--- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
+++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
@@ -4675,6 +4675,29 @@ public class SqlOperatorTest {
     f.checkQuery("select regexp_extract('a9cadca5c4aecghi', 'a[0-9]c', 1, 3)");
   }
 
+  @Test void testRegexpExtractAllFunc() {
+    final SqlOperatorFixture f =
+        
fixture().setFor(SqlLibraryOperators.REGEXP_EXTRACT_ALL).withLibrary(SqlLibrary.BIG_QUERY);
+
+    f.checkScalar("regexp_extract_all('a9cadca5c4aecghi', 'a[0-9]c')", "[a9c, 
a5c]", "CHAR(16) "
+        + "NOT NULL ARRAY NOT NULL");
+    f.checkScalar("regexp_extract_all('abcde', '.')", "[a, b, c, d, e]", 
"CHAR(5) NOT NULL ARRAY "
+        + "NOT NULL");
+    f.checkScalar("regexp_extract_all('abcadcabcaecghi', 'ac')", "[]", 
"CHAR(15) NOT NULL ARRAY "
+        + "NOT NULL");
+    f.checkScalar("regexp_extract_all('f...@bar.com, f...@gmail.com, 
f...@outlook.com', "
+        + "'@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+')", "[@bar.com, @gmail.com, 
@outlook.com]", "CHAR(43) "
+        + "NOT NULL ARRAY NOT NULL");
+
+    f.checkNull("regexp_extract_all('abc def ghi', cast(null as varchar))");
+    f.checkNull("regexp_extract_all(cast(null as varchar), 'abc')");
+    f.checkNull("regexp_extract_all(cast(null as varchar), cast(null as 
varchar))");
+
+    f.checkQuery("select regexp_extract_all('abc def ghi', 'abc')");
+    f.checkQuery("select regexp_extract_all('f...@bar.com', 
'@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')");
+    f.checkQuery("select regexp_extract_all('55as56664as422', '\\d{10}')");
+  }
+
   @Test void testRegexpReplaceFunc() {
     final SqlOperatorFixture f0 = fixture();
     final Consumer<SqlOperatorFixture> consumer = f -> {

Reply via email to