This is an automated email from the ASF dual-hosted git repository.

tanner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git


The following commit(s) were added to refs/heads/main by this push:
     new 9e3ea96504 [CALCITE-5978] Add REGEXP_INSTR function (enabled in 
BigQuery library)
9e3ea96504 is described below

commit 9e3ea965044e477416189fa07423433aa51b0ae1
Author: anthrino <[email protected]>
AuthorDate: Tue Sep 5 16:34:24 2023 -0700

    [CALCITE-5978] Add REGEXP_INSTR function (enabled in BigQuery library)
---
 babel/src/test/resources/sql/big-query.iq          | 155 +++++++++++++++++++++
 .../calcite/adapter/enumerable/RexImpTable.java    |   4 +
 .../org/apache/calcite/runtime/SqlFunctions.java   | 120 +++++++++++++---
 .../calcite/sql/fun/SqlLibraryOperators.java       |   8 ++
 .../org/apache/calcite/sql/type/OperandTypes.java  |   6 +
 .../org/apache/calcite/util/BuiltInMethod.java     |   8 ++
 .../org/apache/calcite/test/SqlFunctionsTest.java  |  77 ++++++++++
 site/_docs/reference.md                            |   1 +
 .../org/apache/calcite/test/SqlOperatorTest.java   |  31 +++++
 9 files changed, 388 insertions(+), 22 deletions(-)

diff --git a/babel/src/test/resources/sql/big-query.iq 
b/babel/src/test/resources/sql/big-query.iq
index 2479eb3663..5455de9f57 100755
--- a/babel/src/test/resources/sql/big-query.iq
+++ b/babel/src/test/resources/sql/big-query.iq
@@ -1244,6 +1244,161 @@ SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi", 
"(a.c).(.*)$");
 Multiple capturing groups (count=2) not allowed in regex input for 
REGEXP_EXTRACT_ALL
 !error
 
+#####################################################################
+# REGEXP_INSTR(value, regexp[, position[, occurrence[, occurrence_position]]])
+#
+# Returns the lowest 1-based position of regexp in value.
+# Returns 0 if there is no match, regex is empty or if position or occurrence 
are beyond range.
+# Returns an exception if regex, position, occurrence or occurrence_position 
are invalid.
+
+WITH example AS (
+  SELECT 'ab@cd-ef' AS source_value, '@[^-]*' AS regexp UNION ALL
+  SELECT 'ab@d-ef', '@[^-]*' UNION ALL
+  SELECT 'abc@cd-ef', '@[^-]*' UNION ALL
+  SELECT 'abc-ef', '@[^-]*')
+SELECT source_value, regexp, REGEXP_INSTR(source_value, regexp) AS instr
+FROM example;
++--------------+--------+-------+
+| source_value | regexp | instr |
++--------------+--------+-------+
+| ab@cd-ef     | @[^-]* |     3 |
+| ab@d-ef      | @[^-]* |     3 |
+| abc-ef       | @[^-]* |     0 |
+| abc@cd-ef    | @[^-]* |     4 |
++--------------+--------+-------+
+(4 rows)
+
+!ok
+
+WITH example AS (
+  SELECT 'a@cd-ef b@cd-ef' AS source_value, '@[^-]*' AS regexp, 1 AS position 
UNION ALL
+  SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 2 UNION ALL
+  SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 3 UNION ALL
+  SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 4)
+SELECT
+  source_value, regexp, position,
+  REGEXP_INSTR(source_value, regexp, position) AS instr
+FROM example;
++-----------------+--------+----------+-------+
+| source_value    | regexp | position | instr |
++-----------------+--------+----------+-------+
+| a@cd-ef b@cd-ef | @[^-]* |        1 |     2 |
+| a@cd-ef b@cd-ef | @[^-]* |        2 |     2 |
+| a@cd-ef b@cd-ef | @[^-]* |        3 |    10 |
+| a@cd-ef b@cd-ef | @[^-]* |        4 |    10 |
++-----------------+--------+----------+-------+
+(4 rows)
+
+!ok
+
+WITH example AS (
+  SELECT 'a@cd-ef b@cd-ef c@cd-ef' AS source_value,
+         '@[^-]*' AS regexp, 1 AS position, 1 AS occurrence UNION ALL
+  SELECT 'a@cd-ef b@cd-ef c@cd-ef', '@[^-]*', 1, 2 UNION ALL
+  SELECT 'a@cd-ef b@cd-ef c@cd-ef', '@[^-]*', 1, 3)
+SELECT
+  source_value, regexp, position, occurrence,
+  REGEXP_INSTR(source_value, regexp, position, occurrence) AS instr
+FROM example;
++-------------------------+--------+----------+------------+-------+
+| source_value            | regexp | position | occurrence | instr |
++-------------------------+--------+----------+------------+-------+
+| a@cd-ef b@cd-ef c@cd-ef | @[^-]* |        1 |          1 |     2 |
+| a@cd-ef b@cd-ef c@cd-ef | @[^-]* |        1 |          2 |    10 |
+| a@cd-ef b@cd-ef c@cd-ef | @[^-]* |        1 |          3 |    18 |
++-------------------------+--------+----------+------------+-------+
+(3 rows)
+
+!ok
+
+WITH example AS (
+  SELECT 'a@cd-ef' AS source_value, '@[^-]*' AS regexp,
+         1 AS position, 1 AS occurrence, 0 AS o_position UNION ALL
+  SELECT 'a@cd-ef', '@[^-]*', 1, 1, 1)
+SELECT
+  source_value, regexp, position, occurrence, o_position,
+  REGEXP_INSTR(source_value, regexp, position, occurrence, o_position) AS instr
+FROM example;
++--------------+--------+----------+------------+------------+-------+
+| source_value | regexp | position | occurrence | o_position | instr |
++--------------+--------+----------+------------+------------+-------+
+| a@cd-ef      | @[^-]* |        1 |          1 |          0 |     2 |
+| a@cd-ef      | @[^-]* |        1 |          1 |          1 |     5 |
++--------------+--------+----------+------------+------------+-------+
+(2 rows)
+
+!ok
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.+c");
++--------+
+| EXPR$0 |
++--------+
+|      1 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "abc(a.c)", 4);
++--------+
+| EXPR$0 |
++--------+
+|     10 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 25);
++--------+
+| EXPR$0 |
++--------+
+|      0 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 1, 5);
++--------+
+| EXPR$0 |
++--------+
+|      0 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("a9cadca5c4aecghi", "a[0-9]c", 1, 2, 1);
++--------+
+| EXPR$0 |
++--------+
+|     10 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("abc def ghi", "adz)");
+Invalid regular expression for REGEXP_INSTR: 'Unmatched closing ')' near index 
2 adz)   ^'
+!error
+
+SELECT REGEXP_INSTR("abc def ghi", "(^)a(.*)b($)");
+Multiple capturing groups (count=3) not allowed in regex input for REGEXP_INSTR
+!error
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 0);
+Invalid integer input '0' for argument 'position' in REGEXP_INSTR
+!error
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 3, -2);
+Invalid integer input '-2' for argument 'occurrence' in REGEXP_INSTR
+!error
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 3, 2, -2);
+Invalid integer input '-2' for argument 'occurrence_position' in REGEXP_INSTR
+!error
+
 #####################################################################
 # REGEXP_SUBSTR(value, regexp[, position[, occurrence]])
 #
diff --git 
a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java 
b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
index 03d7a07c0f..de84ea7d71 100644
--- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
+++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
@@ -224,6 +224,7 @@ import static 
org.apache.calcite.sql.fun.SqlLibraryOperators.POW;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT;
 import static 
org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_INSTR;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE;
@@ -582,6 +583,9 @@ public class RexImpTable {
       defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method,
           BuiltInMethod.REGEXP_EXTRACT3.method, 
BuiltInMethod.REGEXP_EXTRACT4.method);
       defineReflective(REGEXP_EXTRACT_ALL, 
BuiltInMethod.REGEXP_EXTRACT_ALL.method);
+      defineReflective(REGEXP_INSTR, BuiltInMethod.REGEXP_INSTR2.method,
+          BuiltInMethod.REGEXP_INSTR3.method, 
BuiltInMethod.REGEXP_INSTR4.method,
+          BuiltInMethod.REGEXP_INSTR5.method);
 
       map.put(TRIM, new TrimImplementor());
 
diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java 
b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
index 835eb068b4..1b659d6cb3 100644
--- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
+++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
@@ -365,7 +365,8 @@ public class SqlFunctions {
     return DigestUtils.sha512Hex(string.getBytes());
   }
 
-  /** State for {@code REGEXP_CONTAINS}, {@code REGEXP_REPLACE}, {@code RLIKE}.
+  /** State for {@code REGEXP_CONTAINS}, {@code REGEXP_EXTRACT}, {@code 
REGEXP_EXTRACT_ALL},
+   * {@code REGEXP_INSTR}, {@code REGEXP_REPLACE}, {@code RLIKE}.
    *
    * <p>Marked deterministic so that the code generator instantiates one once
    * per query, not once per row. */
@@ -404,7 +405,7 @@ public class SqlFunctions {
       }
     }
 
-    /** Helper for multiple capturing group regex check in REGEXP_EXTRACT fns. 
*/
+    /** Helper for multiple capturing group regex check in REGEXP_* fns. */
     private void checkMultipleCapturingGroupsInRegex(Matcher matcher, String 
methodName) {
       if (matcher.groupCount() > 1) {
         throw RESOURCE.multipleCapturingGroupsForRegexpExtract(
@@ -412,31 +413,54 @@ public class SqlFunctions {
       }
     }
 
+    /** Helper for checking values of position and occurrence arguments in 
REGEXP_* fns.
+     * Regex fns not using occurrencePosition param pass a default value of 0.
+     * Throws an exception or returns true in case of failed value checks. */
+    private boolean checkPosOccurrenceParamValues(int position,
+        int occurrence, int occurrencePosition, String value, String 
methodName) {
+      if (position <= 0) {
+        throw 
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position),
+            "position", methodName).ex();
+      }
+      if (occurrence <= 0) {
+        throw 
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrence),
+            "occurrence", methodName).ex();
+      }
+      if (occurrencePosition != 0 && occurrencePosition != 1) {
+        throw 
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrencePosition),
+            "occurrence_position", methodName).ex();
+      }
+      if (position <= value.length()) {
+        return false;
+      }
+      return true;
+    }
+
     /** SQL {@code REGEXP_CONTAINS(value, regexp)} function.
-     * Throws a runtime exception for invalid regular expressions.*/
+     * Throws a runtime exception for invalid regular expressions. */
     public boolean regexpContains(String value, String regex) {
       final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS");
       return pattern.matcher(value).find();
     }
 
     /** SQL {@code REGEXP_EXTRACT(value, regexp)} function.
-     *  Returns NULL if there is no match. Returns an exception if regex is 
invalid.
-     *  Uses position=1 and occurrence=1 as default values when not specified. 
*/
+     * Returns NULL if there is no match. Returns an exception if regex is 
invalid.
+     * Uses position=1 and occurrence=1 as default values when not specified. 
*/
     public @Nullable String regexpExtract(String value, String regex) {
       return regexpExtract(value, regex, 1, 1);
     }
 
     /** SQL {@code REGEXP_EXTRACT(value, regexp, position)} function.
-     *  Returns NULL if there is no match, or if position is beyond range.
-     *  Returns an exception if regex or position is invalid.
-     *  Uses occurrence=1 as default value when not specified. */
+     * Returns NULL if there is no match, or if position is beyond range.
+     * Returns an exception if regex or position is invalid.
+     * Uses occurrence=1 as default value when not specified. */
     public @Nullable String regexpExtract(String value, String regex, int 
position) {
       return regexpExtract(value, regex, position, 1);
     }
 
     /** SQL {@code REGEXP_EXTRACT(value, regexp, position, occurrence)} 
function.
-     *  Returns NULL if there is no match, or if position or occurrence are 
beyond range.
-     *  Returns an exception if regex, position or occurrence are invalid. */
+     * Returns NULL if there is no match, or if position or occurrence are 
beyond range.
+     * Returns an exception if regex, position or occurrence are invalid. */
     public @Nullable String regexpExtract(String value, String regex, int 
position,
         int occurrence) {
       // Uses java.util.regex as a standard for regex processing
@@ -444,16 +468,7 @@ public class SqlFunctions {
       final String methodName = "REGEXP_EXTRACT";
       final Pattern pattern = validateRegexPattern(regex, methodName);
 
-      if (position <= 0) {
-        throw 
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position),
-            "position", methodName).ex();
-      }
-      if (occurrence <= 0) {
-        throw 
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrence),
-            "occurrence", methodName).ex();
-      }
-
-      if (position > value.length()) {
+      if (checkPosOccurrenceParamValues(position, occurrence, 0, value, 
methodName)) {
         return null;
       }
 
@@ -475,7 +490,7 @@ public class SqlFunctions {
     }
 
     /** SQL {@code REGEXP_EXTRACT_ALL(value, regexp)} function.
-     *  Returns an empty array if there is no match, returns an exception if 
regex is invalid.*/
+     * Returns an empty array if there is no match, returns an exception if 
regex is invalid. */
     public List<String> regexpExtractAll(String value, String regex) {
       // Uses java.util.regex as a standard for regex processing
       // in Calcite instead of RE2 used by BigQuery/GoogleSQL
@@ -488,13 +503,74 @@ public class SqlFunctions {
       ImmutableList.Builder<String> matches = ImmutableList.builder();
       while (matcher.find()) {
         String match = matcher.group(matcher.groupCount());
-        if (match != null && !match.isEmpty()) {
+        if (match != null) {
           matches.add(match);
         }
       }
       return matches.build();
     }
 
+    /** SQL {@code REGEXP_INSTR(value, regexp)} function.
+     * Returns 0 if there is no match or regex is empty. Returns an exception 
if regex is invalid.
+     * Uses position=1, occurrence=1, occurrencePosition=0 as default values 
if not specified. */
+    public int regexpInstr(String value, String regex) {
+      return regexpInstr(value, regex, 1, 1, 0);
+    }
+
+    /** SQL {@code REGEXP_INSTR(value, regexp, position)} function.
+     * Returns 0 if there is no match, regex is empty, or if position is 
beyond range.
+     * Returns an exception if regex or position is invalid.
+     * Uses occurrence=1, occurrencePosition=0 as default value when not 
specified. */
+    public int regexpInstr(String value, String regex, int position) {
+      return regexpInstr(value, regex, position, 1, 0);
+    }
+
+    /** SQL {@code REGEXP_INSTR(value, regexp, position, occurrence)} function.
+     * Returns 0 if there is no match, regex is empty, or if position or 
occurrence
+     * are beyond range. Returns an exception if regex, position or occurrence 
are invalid.
+     * Uses occurrencePosition=0 as default value when not specified. */
+    public int regexpInstr(String value, String regex, int position,
+        int occurrence) {
+      return regexpInstr(value, regex, position, occurrence, 0);
+    }
+
+    /** SQL {@code REGEXP_INSTR(value, regexp, position, occurrence, 
occurrencePosition)}
+     * function. Returns 0 if there is no match, regex is empty, or if 
position or occurrence
+     * are beyond range. Returns an exception if regex, position, occurrence
+     * or occurrencePosition are invalid. */
+    public int regexpInstr(String value, String regex, int position,
+        int occurrence, int occurrencePosition) {
+      // Uses java.util.regex as a standard for regex processing
+      // in Calcite instead of RE2 used by BigQuery/GoogleSQL
+      final String methodName = "REGEXP_INSTR";
+      final Pattern pattern = validateRegexPattern(regex, methodName);
+
+      if (checkPosOccurrenceParamValues(position, occurrence, 
occurrencePosition, value,
+          methodName) || regex.isEmpty()) {
+        return 0;
+      }
+
+      Matcher matcher = pattern.matcher(value);
+      checkMultipleCapturingGroupsInRegex(matcher, methodName);
+      matcher.region(position - 1, value.length());
+
+      int matchIndex = 0;
+      while (occurrence > 0) {
+        if (matcher.find()) {
+          if (occurrencePosition == 0) {
+            matchIndex = matcher.start(matcher.groupCount()) + 1;
+          } else {
+            matchIndex = matcher.end(matcher.groupCount()) + 1;
+          }
+        } else {
+          return 0;
+        }
+        occurrence--;
+      }
+
+      return matchIndex;
+    }
+
     /** SQL {@code REGEXP_REPLACE} function with 3 arguments. */
     public String regexpReplace(String s, String regex,
         String replacement) {
diff --git 
a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java 
b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
index 43f5f6b9b4..f850ab1cc0 100644
--- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
+++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
@@ -500,6 +500,14 @@ public abstract class SqlLibraryOperators {
           OperandTypes.STRING_STRING,
           SqlFunctionCategory.STRING);
 
+  /** The "REGEXP_INSTR(value, regexp [, position[, occurrence, 
[occurrence_position]]])" function.
+   * Returns the lowest 1-based position of a regexp in value. Returns NULL if 
there is no match. */
+  @LibraryOperator(libraries = {BIG_QUERY})
+  public static final SqlBasicFunction REGEXP_INSTR =
+      SqlBasicFunction.create("REGEXP_INSTR", ReturnTypes.INTEGER_NULLABLE,
+          
OperandTypes.STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER_OPTIONAL_INTEGER,
+          SqlFunctionCategory.STRING);
+
   @LibraryOperator(libraries = {MYSQL, ORACLE})
   public static final SqlFunction REGEXP_REPLACE = new 
SqlRegexpReplaceFunction();
 
diff --git a/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java 
b/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
index 8f77ddedd9..8f1d6facdc 100644
--- a/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
+++ b/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
@@ -867,6 +867,12 @@ public abstract class OperandTypes {
           ImmutableList.of(SqlTypeFamily.STRING, SqlTypeFamily.STRING, 
SqlTypeFamily.INTEGER,
               SqlTypeFamily.INTEGER), i -> i == 2 || i == 3);
 
+  public static final SqlSingleOperandTypeChecker
+      STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER_OPTIONAL_INTEGER =
+      family(
+          ImmutableList.of(SqlTypeFamily.STRING, SqlTypeFamily.STRING, 
SqlTypeFamily.INTEGER,
+              SqlTypeFamily.INTEGER, SqlTypeFamily.INTEGER), i -> i == 2 || i 
== 3 || i == 4);
+
   public static final SqlSingleOperandTypeChecker STRING_INTEGER =
       family(SqlTypeFamily.STRING, SqlTypeFamily.INTEGER);
 
diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java 
b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
index 19c5228e09..4652ca6bab 100644
--- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
+++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
@@ -534,6 +534,14 @@ public enum BuiltInMethod {
       String.class, String.class, int.class, int.class),
   REGEXP_EXTRACT_ALL(SqlFunctions.RegexFunction.class, "regexpExtractAll",
       String.class, String.class),
+  REGEXP_INSTR2(SqlFunctions.RegexFunction.class, "regexpInstr",
+      String.class, String.class),
+  REGEXP_INSTR3(SqlFunctions.RegexFunction.class, "regexpInstr",
+      String.class, String.class, int.class),
+  REGEXP_INSTR4(SqlFunctions.RegexFunction.class, "regexpInstr",
+      String.class, String.class, int.class, int.class),
+  REGEXP_INSTR5(SqlFunctions.RegexFunction.class, "regexpInstr",
+      String.class, String.class, int.class, int.class, int.class),
   REGEXP_REPLACE3(SqlFunctions.RegexFunction.class, "regexpReplace",
       String.class, String.class, String.class),
   REGEXP_REPLACE4(SqlFunctions.RegexFunction.class, "regexpReplace",
diff --git a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java 
b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
index 52a33b0e8f..f31a182979 100644
--- a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
+++ b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
@@ -240,6 +240,7 @@ class SqlFunctionsTest {
     assertThat(f.regexpContains("abcdef", "abz*"), is(true));
     assertThat(f.regexpContains("zabzz", "abz*"), is(true));
     assertThat(f.regexpContains("zazbbzz", "abz*"), is(false));
+    assertThat(f.regexpContains("abcadcabcaecghi", ""), is(true));
 
     try {
       final boolean b = f.regexpContains("abc def ghi", "(abc");
@@ -274,6 +275,7 @@ class SqlFunctionsTest {
 
     // basic extracts
     assertThat(f.regexpExtract("abcadcabcaecghi", "ac"), nullValue());
+    assertThat(f.regexpExtract("abcadcabcaecghi", ""), is(""));
     assertThat(f.regexpExtract("a9cadca5c4aecghi", "a[0-9]c"), is("a9c"));
     assertThat(f.regexpExtract("abcadcabcaecghi", "a.*c"), is("abcadcabcaec"));
 
@@ -342,6 +344,7 @@ class SqlFunctionsTest {
     final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
 
     assertThat(f.regexpExtractAll("abcadcabcaecghi", "ac"), is(list()));
+    assertThat(f.regexpExtractAll("abcadc", ""), is(list("", "", "", "", "", 
"", "")));
     assertThat(f.regexpExtractAll("abcadcabcaecghi", "abc(a.c)"), 
is(list("adc", "aec")));
     assertThat(f.regexpExtractAll("abcadcabcaecghi", "a.c"), is(list("abc", 
"adc", "abc", "aec")));
     assertThat(f.regexpExtractAll("banana", "ana"), is(list("ana")));
@@ -367,6 +370,80 @@ class SqlFunctionsTest {
     }
   }
 
+  @Test void testRegexpInstr() {
+    final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
+
+    // basic searches
+    assertThat(f.regexpInstr("abcdefghij", "adc"), is(0));
+    assertThat(f.regexpInstr("abcdefghij", ""), is(0));
+    assertThat(f.regexpInstr("a9ca5c4aechi", "a[0-9]c"), is(1));
+    assertThat(f.regexpInstr("abcadcabcaecghi", ".dc"), is(4));
+
+    // capturing group searches
+    assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)"), is(4));
+    assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 4), is(10));
+    assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 1, 2), is(10));
+    assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 1, 2, 1), is(13));
+
+    // position-based searches
+    assertThat(f.regexpInstr("abcadcabcaecghi", ".ec", 25), is(0));
+    assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 4), is(7));
+    assertThat(f.regexpInstr("abcadcabcaecghi", "a.*c", 7), is(7));
+
+    // occurrence-based searches
+    assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 3), is(0));
+    assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 2, 1), is(7));
+    assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 1), is(1));
+
+    // occurrence_position-based searches
+    assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 1, 0), is(1));
+    assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 7, 1, 1), is(13));
+    assertThat(f.regexpInstr("abcadcabcaec", "abc(a.c)", 4, 1, 1), is(13));
+
+    // exceptional scenarios
+    try {
+      final int idx = f.regexpInstr("abc def ghi", "{4,1}");
+      fail("expected error, got " + idx);
+    } catch (RuntimeException e) {
+      assertThat(e.getMessage(),
+          is("Invalid regular expression for REGEXP_INSTR: 'Illegal repetition 
range near index 4"
+              + " {4,1}     ^'"));
+    }
+
+    try {
+      final int idx = f.regexpInstr("abcadcabcaecghi", "(.)a(.c)");
+      fail("expected error, got " + idx);
+    } catch (RuntimeException e) {
+      assertThat(e.getMessage(),
+          is("Multiple capturing groups (count=2) not allowed in regex input 
for "
+              + "REGEXP_INSTR"));
+    }
+
+    try {
+      final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 0);
+      fail("expected error, got " + idx);
+    } catch (RuntimeException e) {
+      assertThat(e.getMessage(),
+          is("Invalid integer input '0' for argument 'position' in 
REGEXP_INSTR"));
+    }
+
+    try {
+      final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 3, -1);
+      fail("expected error, got " + idx);
+    } catch (RuntimeException e) {
+      assertThat(e.getMessage(),
+          is("Invalid integer input '-1' for argument 'occurrence' in 
REGEXP_INSTR"));
+    }
+
+    try {
+      final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 2, 4, -4);
+      fail("expected error, got " + idx);
+    } catch (RuntimeException e) {
+      assertThat(e.getMessage(),
+          is("Invalid integer input '-4' for argument 'occurrence_position' in 
REGEXP_INSTR"));
+    }
+  }
+
   @Test void testRegexpReplace() {
     final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
     assertThat(f.regexpReplace("a b c", "b", "X"), is("a X c"));
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 6c0b9c2a63..857445d27f 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -2790,6 +2790,7 @@ BigQuery's type system uses confusingly different names 
for types and functions:
 | b | REGEXP_CONTAINS(string, regexp)                | Returns whether 
*string* is a partial match for the *regexp*
 | b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the 
substring in *string* that matches the *regexp*, starting search at *position* 
(default 1), and until locating the nth *occurrence* (default 1). Returns NULL 
if there is no match
 | b | REGEXP_EXTRACT_ALL(string, regexp)             | Returns an array of all 
substrings in *string* that matches the *regexp*. Returns an empty array if 
there is no match
+| b | REGEXP_INSTR(string, regexp [, position [, occurrence [, 
occurrence_position]]]) | Returns the lowest 1-based position of the substring 
in *string* that matches the *regexp*, starting search at *position* (default 
1), and until locating the nth *occurrence* (default 1). Setting 
occurrence_position (default 0) to 1 returns the end position of substring + 1. 
Returns 0 if there is no match
 | m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, 
matchType]]]) | Replaces all substrings of *string* that match *regexp* with 
*rep* at the starting *pos* in expr (if omitted, the default is 1), 
*occurrence* means which occurrence of a match to search for (if omitted, the 
default is 1), *matchType* specifies how to perform matching
 | b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for 
REGEXP_EXTRACT
 | b m p | REPEAT(string, integer)                    | Returns a string 
consisting of *string* repeated of *integer* times; returns an empty string if 
*integer* is less than 1
diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java 
b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
index 4e0636be07..e4b774ac59 100644
--- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
+++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
@@ -4666,6 +4666,7 @@ public class SqlOperatorTest {
     f.checkBoolean("regexp_contains('11555666442233', '^\\d{10}$')", false);
     f.checkBoolean("regexp_contains('55566644221133', '\\d{10}')", true);
     f.checkBoolean("regexp_contains('55as56664as422', '\\d{10}')", false);
+    f.checkBoolean("regexp_contains('55as56664as422', '')", true);
 
     f.checkQuery("select regexp_contains('abc def ghi', 'abc')");
     f.checkQuery("select regexp_contains('[email protected]', 
'@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')");
@@ -4729,6 +4730,36 @@ public class SqlOperatorTest {
     f.checkQuery("select regexp_extract_all('55as56664as422', '\\d{10}')");
   }
 
+  @Test void testRegexpInstrFunc() {
+    final SqlOperatorFixture f =
+        
fixture().setFor(SqlLibraryOperators.REGEXP_INSTR).withLibrary(SqlLibrary.BIG_QUERY);
+
+    f.checkScalar("regexp_instr('abc def ghi', 'def')", 5, "INTEGER NOT NULL");
+    f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 2)", 4, "INTEGER NOT 
NULL");
+    f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 1, 3)", 7, "INTEGER NOT 
NULL");
+    f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 1, 3, 1)", 10, "INTEGER 
NOT NULL");
+    f.checkScalar("regexp_instr('a9cadca513ca4cecghi', 'a([0-9]+)', 1, 2, 1)", 
11,
+        "INTEGER NOT NULL");
+    f.checkScalar("regexp_instr('a9cadca513ca4cecghi', 'a([0-9]*)', 8, 1, 0)", 
13,
+        "INTEGER NOT NULL");
+    f.checkScalar("regexp_instr('55as56664as422', '\\d{3}', 3, 2, 0)", 12, 
"INTEGER NOT NULL");
+    f.checkScalar("regexp_instr('55as56664as422', '\\d{2}', 2, 2, 1)", 9, 
"INTEGER NOT NULL");
+    f.checkScalar("regexp_instr('55as56664as422', '', 2, 2, 1)", 0, "INTEGER 
NOT NULL");
+
+    f.checkNull("regexp_instr('abc def ghi', cast(null as varchar))");
+    f.checkNull("regexp_instr(cast(null as varchar), 'abc')");
+    f.checkNull("regexp_instr(cast(null as varchar), cast(null as varchar))");
+    f.checkNull("regexp_instr('abc def ghi', 'abc', cast(null as integer))");
+    f.checkNull("regexp_instr('abc def ghi', 'abc', 1, cast(null as 
integer))");
+    f.checkNull("regexp_instr('abc def ghi', 'abc', 1, 3, cast(null as 
integer))");
+
+    f.checkQuery("select regexp_instr('abc def ghi', 'abc')");
+    f.checkQuery("select regexp_instr('[email protected]', 
'@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')");
+    f.checkQuery("select regexp_instr('55as56664as422', '\\d{10}')");
+    f.checkQuery("select regexp_instr('abcadcabcaecghi', 'c(a.c)', 4)");
+    f.checkQuery("select regexp_instr('a9cadca5c4aecghi', 'a[0-9]c', 1, 3)");
+  }
+
   @Test void testRegexpReplaceFunc() {
     final SqlOperatorFixture f0 = fixture();
     final Consumer<SqlOperatorFixture> consumer = f -> {

Reply via email to