This is an automated email from the ASF dual-hosted git repository.
tanner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/main by this push:
new 9e3ea96504 [CALCITE-5978] Add REGEXP_INSTR function (enabled in
BigQuery library)
9e3ea96504 is described below
commit 9e3ea965044e477416189fa07423433aa51b0ae1
Author: anthrino <[email protected]>
AuthorDate: Tue Sep 5 16:34:24 2023 -0700
[CALCITE-5978] Add REGEXP_INSTR function (enabled in BigQuery library)
---
babel/src/test/resources/sql/big-query.iq | 155 +++++++++++++++++++++
.../calcite/adapter/enumerable/RexImpTable.java | 4 +
.../org/apache/calcite/runtime/SqlFunctions.java | 120 +++++++++++++---
.../calcite/sql/fun/SqlLibraryOperators.java | 8 ++
.../org/apache/calcite/sql/type/OperandTypes.java | 6 +
.../org/apache/calcite/util/BuiltInMethod.java | 8 ++
.../org/apache/calcite/test/SqlFunctionsTest.java | 77 ++++++++++
site/_docs/reference.md | 1 +
.../org/apache/calcite/test/SqlOperatorTest.java | 31 +++++
9 files changed, 388 insertions(+), 22 deletions(-)
diff --git a/babel/src/test/resources/sql/big-query.iq
b/babel/src/test/resources/sql/big-query.iq
index 2479eb3663..5455de9f57 100755
--- a/babel/src/test/resources/sql/big-query.iq
+++ b/babel/src/test/resources/sql/big-query.iq
@@ -1244,6 +1244,161 @@ SELECT REGEXP_EXTRACT_ALL("abcadcabcaecghi",
"(a.c).(.*)$");
Multiple capturing groups (count=2) not allowed in regex input for
REGEXP_EXTRACT_ALL
!error
+#####################################################################
+# REGEXP_INSTR(value, regexp[, position[, occurrence[, occurrence_position]]])
+#
+# Returns the lowest 1-based position of regexp in value.
+# Returns 0 if there is no match, regex is empty or if position or occurrence
are beyond range.
+# Returns an exception if regex, position, occurrence or occurrence_position
are invalid.
+
+WITH example AS (
+ SELECT 'ab@cd-ef' AS source_value, '@[^-]*' AS regexp UNION ALL
+ SELECT 'ab@d-ef', '@[^-]*' UNION ALL
+ SELECT 'abc@cd-ef', '@[^-]*' UNION ALL
+ SELECT 'abc-ef', '@[^-]*')
+SELECT source_value, regexp, REGEXP_INSTR(source_value, regexp) AS instr
+FROM example;
++--------------+--------+-------+
+| source_value | regexp | instr |
++--------------+--------+-------+
+| ab@cd-ef | @[^-]* | 3 |
+| ab@d-ef | @[^-]* | 3 |
+| abc-ef | @[^-]* | 0 |
+| abc@cd-ef | @[^-]* | 4 |
++--------------+--------+-------+
+(4 rows)
+
+!ok
+
+WITH example AS (
+ SELECT 'a@cd-ef b@cd-ef' AS source_value, '@[^-]*' AS regexp, 1 AS position
UNION ALL
+ SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 2 UNION ALL
+ SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 3 UNION ALL
+ SELECT 'a@cd-ef b@cd-ef', '@[^-]*', 4)
+SELECT
+ source_value, regexp, position,
+ REGEXP_INSTR(source_value, regexp, position) AS instr
+FROM example;
++-----------------+--------+----------+-------+
+| source_value | regexp | position | instr |
++-----------------+--------+----------+-------+
+| a@cd-ef b@cd-ef | @[^-]* | 1 | 2 |
+| a@cd-ef b@cd-ef | @[^-]* | 2 | 2 |
+| a@cd-ef b@cd-ef | @[^-]* | 3 | 10 |
+| a@cd-ef b@cd-ef | @[^-]* | 4 | 10 |
++-----------------+--------+----------+-------+
+(4 rows)
+
+!ok
+
+WITH example AS (
+ SELECT 'a@cd-ef b@cd-ef c@cd-ef' AS source_value,
+ '@[^-]*' AS regexp, 1 AS position, 1 AS occurrence UNION ALL
+ SELECT 'a@cd-ef b@cd-ef c@cd-ef', '@[^-]*', 1, 2 UNION ALL
+ SELECT 'a@cd-ef b@cd-ef c@cd-ef', '@[^-]*', 1, 3)
+SELECT
+ source_value, regexp, position, occurrence,
+ REGEXP_INSTR(source_value, regexp, position, occurrence) AS instr
+FROM example;
++-------------------------+--------+----------+------------+-------+
+| source_value | regexp | position | occurrence | instr |
++-------------------------+--------+----------+------------+-------+
+| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 1 | 2 |
+| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 2 | 10 |
+| a@cd-ef b@cd-ef c@cd-ef | @[^-]* | 1 | 3 | 18 |
++-------------------------+--------+----------+------------+-------+
+(3 rows)
+
+!ok
+
+WITH example AS (
+ SELECT 'a@cd-ef' AS source_value, '@[^-]*' AS regexp,
+ 1 AS position, 1 AS occurrence, 0 AS o_position UNION ALL
+ SELECT 'a@cd-ef', '@[^-]*', 1, 1, 1)
+SELECT
+ source_value, regexp, position, occurrence, o_position,
+ REGEXP_INSTR(source_value, regexp, position, occurrence, o_position) AS instr
+FROM example;
++--------------+--------+----------+------------+------------+-------+
+| source_value | regexp | position | occurrence | o_position | instr |
++--------------+--------+----------+------------+------------+-------+
+| a@cd-ef | @[^-]* | 1 | 1 | 0 | 2 |
+| a@cd-ef | @[^-]* | 1 | 1 | 1 | 5 |
++--------------+--------+----------+------------+------------+-------+
+(2 rows)
+
+!ok
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.+c");
++--------+
+| EXPR$0 |
++--------+
+| 1 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "abc(a.c)", 4);
++--------+
+| EXPR$0 |
++--------+
+| 10 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 25);
++--------+
+| EXPR$0 |
++--------+
+| 0 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 1, 5);
++--------+
+| EXPR$0 |
++--------+
+| 0 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("a9cadca5c4aecghi", "a[0-9]c", 1, 2, 1);
++--------+
+| EXPR$0 |
++--------+
+| 10 |
++--------+
+(1 row)
+
+!ok
+
+SELECT REGEXP_INSTR("abc def ghi", "adz)");
+Invalid regular expression for REGEXP_INSTR: 'Unmatched closing ')' near index
2 adz) ^'
+!error
+
+SELECT REGEXP_INSTR("abc def ghi", "(^)a(.*)b($)");
+Multiple capturing groups (count=3) not allowed in regex input for REGEXP_INSTR
+!error
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 0);
+Invalid integer input '0' for argument 'position' in REGEXP_INSTR
+!error
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 3, -2);
+Invalid integer input '-2' for argument 'occurrence' in REGEXP_INSTR
+!error
+
+SELECT REGEXP_INSTR("abcadcabcaecghi", "a.c", 3, 2, -2);
+Invalid integer input '-2' for argument 'occurrence_position' in REGEXP_INSTR
+!error
+
#####################################################################
# REGEXP_SUBSTR(value, regexp[, position[, occurrence]])
#
diff --git
a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
index 03d7a07c0f..de84ea7d71 100644
--- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
+++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
@@ -224,6 +224,7 @@ import static
org.apache.calcite.sql.fun.SqlLibraryOperators.POW;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_CONTAINS;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT;
import static
org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_EXTRACT_ALL;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_INSTR;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REGEXP_REPLACE;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REPEAT;
import static org.apache.calcite.sql.fun.SqlLibraryOperators.REVERSE;
@@ -582,6 +583,9 @@ public class RexImpTable {
defineReflective(REGEXP_EXTRACT, BuiltInMethod.REGEXP_EXTRACT2.method,
BuiltInMethod.REGEXP_EXTRACT3.method,
BuiltInMethod.REGEXP_EXTRACT4.method);
defineReflective(REGEXP_EXTRACT_ALL,
BuiltInMethod.REGEXP_EXTRACT_ALL.method);
+ defineReflective(REGEXP_INSTR, BuiltInMethod.REGEXP_INSTR2.method,
+ BuiltInMethod.REGEXP_INSTR3.method,
BuiltInMethod.REGEXP_INSTR4.method,
+ BuiltInMethod.REGEXP_INSTR5.method);
map.put(TRIM, new TrimImplementor());
diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
index 835eb068b4..1b659d6cb3 100644
--- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
+++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
@@ -365,7 +365,8 @@ public class SqlFunctions {
return DigestUtils.sha512Hex(string.getBytes());
}
- /** State for {@code REGEXP_CONTAINS}, {@code REGEXP_REPLACE}, {@code RLIKE}.
+ /** State for {@code REGEXP_CONTAINS}, {@code REGEXP_EXTRACT}, {@code
REGEXP_EXTRACT_ALL},
+ * {@code REGEXP_INSTR}, {@code REGEXP_REPLACE}, {@code RLIKE}.
*
* <p>Marked deterministic so that the code generator instantiates one once
* per query, not once per row. */
@@ -404,7 +405,7 @@ public class SqlFunctions {
}
}
- /** Helper for multiple capturing group regex check in REGEXP_EXTRACT fns.
*/
+ /** Helper for multiple capturing group regex check in REGEXP_* fns. */
private void checkMultipleCapturingGroupsInRegex(Matcher matcher, String
methodName) {
if (matcher.groupCount() > 1) {
throw RESOURCE.multipleCapturingGroupsForRegexpExtract(
@@ -412,31 +413,54 @@ public class SqlFunctions {
}
}
+ /** Helper for checking values of position and occurrence arguments in
REGEXP_* fns.
+ * Regex fns not using occurrencePosition param pass a default value of 0.
+ * Throws an exception or returns true in case of failed value checks. */
+ private boolean checkPosOccurrenceParamValues(int position,
+ int occurrence, int occurrencePosition, String value, String
methodName) {
+ if (position <= 0) {
+ throw
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position),
+ "position", methodName).ex();
+ }
+ if (occurrence <= 0) {
+ throw
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrence),
+ "occurrence", methodName).ex();
+ }
+ if (occurrencePosition != 0 && occurrencePosition != 1) {
+ throw
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrencePosition),
+ "occurrence_position", methodName).ex();
+ }
+ if (position <= value.length()) {
+ return false;
+ }
+ return true;
+ }
+
/** SQL {@code REGEXP_CONTAINS(value, regexp)} function.
- * Throws a runtime exception for invalid regular expressions.*/
+ * Throws a runtime exception for invalid regular expressions. */
public boolean regexpContains(String value, String regex) {
final Pattern pattern = validateRegexPattern(regex, "REGEXP_CONTAINS");
return pattern.matcher(value).find();
}
/** SQL {@code REGEXP_EXTRACT(value, regexp)} function.
- * Returns NULL if there is no match. Returns an exception if regex is
invalid.
- * Uses position=1 and occurrence=1 as default values when not specified.
*/
+ * Returns NULL if there is no match. Returns an exception if regex is
invalid.
+ * Uses position=1 and occurrence=1 as default values when not specified.
*/
public @Nullable String regexpExtract(String value, String regex) {
return regexpExtract(value, regex, 1, 1);
}
/** SQL {@code REGEXP_EXTRACT(value, regexp, position)} function.
- * Returns NULL if there is no match, or if position is beyond range.
- * Returns an exception if regex or position is invalid.
- * Uses occurrence=1 as default value when not specified. */
+ * Returns NULL if there is no match, or if position is beyond range.
+ * Returns an exception if regex or position is invalid.
+ * Uses occurrence=1 as default value when not specified. */
public @Nullable String regexpExtract(String value, String regex, int
position) {
return regexpExtract(value, regex, position, 1);
}
/** SQL {@code REGEXP_EXTRACT(value, regexp, position, occurrence)}
function.
- * Returns NULL if there is no match, or if position or occurrence are
beyond range.
- * Returns an exception if regex, position or occurrence are invalid. */
+ * Returns NULL if there is no match, or if position or occurrence are
beyond range.
+ * Returns an exception if regex, position or occurrence are invalid. */
public @Nullable String regexpExtract(String value, String regex, int
position,
int occurrence) {
// Uses java.util.regex as a standard for regex processing
@@ -444,16 +468,7 @@ public class SqlFunctions {
final String methodName = "REGEXP_EXTRACT";
final Pattern pattern = validateRegexPattern(regex, methodName);
- if (position <= 0) {
- throw
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(position),
- "position", methodName).ex();
- }
- if (occurrence <= 0) {
- throw
RESOURCE.invalidIntegerInputForRegexpFunctions(Integer.toString(occurrence),
- "occurrence", methodName).ex();
- }
-
- if (position > value.length()) {
+ if (checkPosOccurrenceParamValues(position, occurrence, 0, value,
methodName)) {
return null;
}
@@ -475,7 +490,7 @@ public class SqlFunctions {
}
/** SQL {@code REGEXP_EXTRACT_ALL(value, regexp)} function.
- * Returns an empty array if there is no match, returns an exception if
regex is invalid.*/
+ * Returns an empty array if there is no match, returns an exception if
regex is invalid. */
public List<String> regexpExtractAll(String value, String regex) {
// Uses java.util.regex as a standard for regex processing
// in Calcite instead of RE2 used by BigQuery/GoogleSQL
@@ -488,13 +503,74 @@ public class SqlFunctions {
ImmutableList.Builder<String> matches = ImmutableList.builder();
while (matcher.find()) {
String match = matcher.group(matcher.groupCount());
- if (match != null && !match.isEmpty()) {
+ if (match != null) {
matches.add(match);
}
}
return matches.build();
}
+ /** SQL {@code REGEXP_INSTR(value, regexp)} function.
+ * Returns 0 if there is no match or regex is empty. Returns an exception
if regex is invalid.
+ * Uses position=1, occurrence=1, occurrencePosition=0 as default values
if not specified. */
+ public int regexpInstr(String value, String regex) {
+ return regexpInstr(value, regex, 1, 1, 0);
+ }
+
+ /** SQL {@code REGEXP_INSTR(value, regexp, position)} function.
+ * Returns 0 if there is no match, regex is empty, or if position is
beyond range.
+ * Returns an exception if regex or position is invalid.
+ * Uses occurrence=1, occurrencePosition=0 as default value when not
specified. */
+ public int regexpInstr(String value, String regex, int position) {
+ return regexpInstr(value, regex, position, 1, 0);
+ }
+
+ /** SQL {@code REGEXP_INSTR(value, regexp, position, occurrence)} function.
+ * Returns 0 if there is no match, regex is empty, or if position or
occurrence
+ * are beyond range. Returns an exception if regex, position or occurrence
are invalid.
+ * Uses occurrencePosition=0 as default value when not specified. */
+ public int regexpInstr(String value, String regex, int position,
+ int occurrence) {
+ return regexpInstr(value, regex, position, occurrence, 0);
+ }
+
+ /** SQL {@code REGEXP_INSTR(value, regexp, position, occurrence,
occurrencePosition)}
+ * function. Returns 0 if there is no match, regex is empty, or if
position or occurrence
+ * are beyond range. Returns an exception if regex, position, occurrence
+ * or occurrencePosition are invalid. */
+ public int regexpInstr(String value, String regex, int position,
+ int occurrence, int occurrencePosition) {
+ // Uses java.util.regex as a standard for regex processing
+ // in Calcite instead of RE2 used by BigQuery/GoogleSQL
+ final String methodName = "REGEXP_INSTR";
+ final Pattern pattern = validateRegexPattern(regex, methodName);
+
+ if (checkPosOccurrenceParamValues(position, occurrence,
occurrencePosition, value,
+ methodName) || regex.isEmpty()) {
+ return 0;
+ }
+
+ Matcher matcher = pattern.matcher(value);
+ checkMultipleCapturingGroupsInRegex(matcher, methodName);
+ matcher.region(position - 1, value.length());
+
+ int matchIndex = 0;
+ while (occurrence > 0) {
+ if (matcher.find()) {
+ if (occurrencePosition == 0) {
+ matchIndex = matcher.start(matcher.groupCount()) + 1;
+ } else {
+ matchIndex = matcher.end(matcher.groupCount()) + 1;
+ }
+ } else {
+ return 0;
+ }
+ occurrence--;
+ }
+
+ return matchIndex;
+ }
+
/** SQL {@code REGEXP_REPLACE} function with 3 arguments. */
public String regexpReplace(String s, String regex,
String replacement) {
diff --git
a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
index 43f5f6b9b4..f850ab1cc0 100644
--- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
+++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
@@ -500,6 +500,14 @@ public abstract class SqlLibraryOperators {
OperandTypes.STRING_STRING,
SqlFunctionCategory.STRING);
+ /** The "REGEXP_INSTR(value, regexp [, position[, occurrence,
[occurrence_position]]])" function.
+ * Returns the lowest 1-based position of a regexp in value. Returns NULL if
there is no match. */
+ @LibraryOperator(libraries = {BIG_QUERY})
+ public static final SqlBasicFunction REGEXP_INSTR =
+ SqlBasicFunction.create("REGEXP_INSTR", ReturnTypes.INTEGER_NULLABLE,
+
OperandTypes.STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER_OPTIONAL_INTEGER,
+ SqlFunctionCategory.STRING);
+
@LibraryOperator(libraries = {MYSQL, ORACLE})
public static final SqlFunction REGEXP_REPLACE = new
SqlRegexpReplaceFunction();
diff --git a/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
b/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
index 8f77ddedd9..8f1d6facdc 100644
--- a/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
+++ b/core/src/main/java/org/apache/calcite/sql/type/OperandTypes.java
@@ -867,6 +867,12 @@ public abstract class OperandTypes {
ImmutableList.of(SqlTypeFamily.STRING, SqlTypeFamily.STRING,
SqlTypeFamily.INTEGER,
SqlTypeFamily.INTEGER), i -> i == 2 || i == 3);
+ public static final SqlSingleOperandTypeChecker
+ STRING_STRING_OPTIONAL_INTEGER_OPTIONAL_INTEGER_OPTIONAL_INTEGER =
+ family(
+ ImmutableList.of(SqlTypeFamily.STRING, SqlTypeFamily.STRING,
SqlTypeFamily.INTEGER,
+ SqlTypeFamily.INTEGER, SqlTypeFamily.INTEGER), i -> i == 2 || i
== 3 || i == 4);
+
public static final SqlSingleOperandTypeChecker STRING_INTEGER =
family(SqlTypeFamily.STRING, SqlTypeFamily.INTEGER);
diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
index 19c5228e09..4652ca6bab 100644
--- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
+++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
@@ -534,6 +534,14 @@ public enum BuiltInMethod {
String.class, String.class, int.class, int.class),
REGEXP_EXTRACT_ALL(SqlFunctions.RegexFunction.class, "regexpExtractAll",
String.class, String.class),
+ REGEXP_INSTR2(SqlFunctions.RegexFunction.class, "regexpInstr",
+ String.class, String.class),
+ REGEXP_INSTR3(SqlFunctions.RegexFunction.class, "regexpInstr",
+ String.class, String.class, int.class),
+ REGEXP_INSTR4(SqlFunctions.RegexFunction.class, "regexpInstr",
+ String.class, String.class, int.class, int.class),
+ REGEXP_INSTR5(SqlFunctions.RegexFunction.class, "regexpInstr",
+ String.class, String.class, int.class, int.class, int.class),
REGEXP_REPLACE3(SqlFunctions.RegexFunction.class, "regexpReplace",
String.class, String.class, String.class),
REGEXP_REPLACE4(SqlFunctions.RegexFunction.class, "regexpReplace",
diff --git a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
index 52a33b0e8f..f31a182979 100644
--- a/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
+++ b/core/src/test/java/org/apache/calcite/test/SqlFunctionsTest.java
@@ -240,6 +240,7 @@ class SqlFunctionsTest {
assertThat(f.regexpContains("abcdef", "abz*"), is(true));
assertThat(f.regexpContains("zabzz", "abz*"), is(true));
assertThat(f.regexpContains("zazbbzz", "abz*"), is(false));
+ assertThat(f.regexpContains("abcadcabcaecghi", ""), is(true));
try {
final boolean b = f.regexpContains("abc def ghi", "(abc");
@@ -274,6 +275,7 @@ class SqlFunctionsTest {
// basic extracts
assertThat(f.regexpExtract("abcadcabcaecghi", "ac"), nullValue());
+ assertThat(f.regexpExtract("abcadcabcaecghi", ""), is(""));
assertThat(f.regexpExtract("a9cadca5c4aecghi", "a[0-9]c"), is("a9c"));
assertThat(f.regexpExtract("abcadcabcaecghi", "a.*c"), is("abcadcabcaec"));
@@ -342,6 +344,7 @@ class SqlFunctionsTest {
final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
assertThat(f.regexpExtractAll("abcadcabcaecghi", "ac"), is(list()));
+ assertThat(f.regexpExtractAll("abcadc", ""), is(list("", "", "", "", "",
"", "")));
assertThat(f.regexpExtractAll("abcadcabcaecghi", "abc(a.c)"),
is(list("adc", "aec")));
assertThat(f.regexpExtractAll("abcadcabcaecghi", "a.c"), is(list("abc",
"adc", "abc", "aec")));
assertThat(f.regexpExtractAll("banana", "ana"), is(list("ana")));
@@ -367,6 +370,80 @@ class SqlFunctionsTest {
}
}
+ @Test void testRegexpInstr() {
+ final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
+
+ // basic searches
+ assertThat(f.regexpInstr("abcdefghij", "adc"), is(0));
+ assertThat(f.regexpInstr("abcdefghij", ""), is(0));
+ assertThat(f.regexpInstr("a9ca5c4aechi", "a[0-9]c"), is(1));
+ assertThat(f.regexpInstr("abcadcabcaecghi", ".dc"), is(4));
+
+ // capturing group searches
+ assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)"), is(4));
+ assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 4), is(10));
+ assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 1, 2), is(10));
+ assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 1, 2, 1), is(13));
+
+ // position-based searches
+ assertThat(f.regexpInstr("abcadcabcaecghi", ".ec", 25), is(0));
+ assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 4), is(7));
+ assertThat(f.regexpInstr("abcadcabcaecghi", "a.*c", 7), is(7));
+
+ // occurrence-based searches
+ assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 3), is(0));
+ assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 2, 1), is(7));
+ assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 1), is(1));
+
+ // occurrence_position-based searches
+ assertThat(f.regexpInstr("a9cadca5c4aecghi", "a[0-9]c", 1, 1, 0), is(1));
+ assertThat(f.regexpInstr("abcadcabcaecghi", "abc(a.c)", 7, 1, 1), is(13));
+ assertThat(f.regexpInstr("abcadcabcaec", "abc(a.c)", 4, 1, 1), is(13));
+
+ // exceptional scenarios
+ try {
+ final int idx = f.regexpInstr("abc def ghi", "{4,1}");
+ fail("expected error, got " + idx);
+ } catch (RuntimeException e) {
+ assertThat(e.getMessage(),
+ is("Invalid regular expression for REGEXP_INSTR: 'Illegal repetition
range near index 4"
+ + " {4,1} ^'"));
+ }
+
+ try {
+ final int idx = f.regexpInstr("abcadcabcaecghi", "(.)a(.c)");
+ fail("expected error, got " + idx);
+ } catch (RuntimeException e) {
+ assertThat(e.getMessage(),
+ is("Multiple capturing groups (count=2) not allowed in regex input
for "
+ + "REGEXP_INSTR"));
+ }
+
+ try {
+ final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 0);
+ fail("expected error, got " + idx);
+ } catch (RuntimeException e) {
+ assertThat(e.getMessage(),
+ is("Invalid integer input '0' for argument 'position' in
REGEXP_INSTR"));
+ }
+
+ try {
+ final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 3, -1);
+ fail("expected error, got " + idx);
+ } catch (RuntimeException e) {
+ assertThat(e.getMessage(),
+ is("Invalid integer input '-1' for argument 'occurrence' in
REGEXP_INSTR"));
+ }
+
+ try {
+ final int idx = f.regexpInstr("abcadcabcaecghi", "a.c", 2, 4, -4);
+ fail("expected error, got " + idx);
+ } catch (RuntimeException e) {
+ assertThat(e.getMessage(),
+ is("Invalid integer input '-4' for argument 'occurrence_position' in
REGEXP_INSTR"));
+ }
+ }
+
@Test void testRegexpReplace() {
final SqlFunctions.RegexFunction f = new SqlFunctions.RegexFunction();
assertThat(f.regexpReplace("a b c", "b", "X"), is("a X c"));
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 6c0b9c2a63..857445d27f 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -2790,6 +2790,7 @@ BigQuery's type system uses confusingly different names
for types and functions:
| b | REGEXP_CONTAINS(string, regexp) | Returns whether
*string* is a partial match for the *regexp*
| b | REGEXP_EXTRACT(string, regexp [, position [, occurrence]]) | Returns the
substring in *string* that matches the *regexp*, starting search at *position*
(default 1), and until locating the nth *occurrence* (default 1). Returns NULL
if there is no match
| b | REGEXP_EXTRACT_ALL(string, regexp) | Returns an array of all
substrings in *string* that matches the *regexp*. Returns an empty array if
there is no match
+| b | REGEXP_INSTR(string, regexp [, position [, occurrence [,
occurrence_position]]]) | Returns the lowest 1-based position of the substring
in *string* that matches the *regexp*, starting search at *position* (default
1), and until locating the nth *occurrence* (default 1). Setting
occurrence_position (default 0) to 1 returns the end position of substring + 1.
Returns 0 if there is no match
| m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [,
matchType]]]) | Replaces all substrings of *string* that match *regexp* with
*rep* at the starting *pos* in expr (if omitted, the default is 1),
*occurrence* means which occurrence of a match to search for (if omitted, the
default is 1), *matchType* specifies how to perform matching
| b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for
REGEXP_EXTRACT
| b m p | REPEAT(string, integer) | Returns a string
consisting of *string* repeated of *integer* times; returns an empty string if
*integer* is less than 1
diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
index 4e0636be07..e4b774ac59 100644
--- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
+++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
@@ -4666,6 +4666,7 @@ public class SqlOperatorTest {
f.checkBoolean("regexp_contains('11555666442233', '^\\d{10}$')", false);
f.checkBoolean("regexp_contains('55566644221133', '\\d{10}')", true);
f.checkBoolean("regexp_contains('55as56664as422', '\\d{10}')", false);
+ f.checkBoolean("regexp_contains('55as56664as422', '')", true);
f.checkQuery("select regexp_contains('abc def ghi', 'abc')");
f.checkQuery("select regexp_contains('[email protected]',
'@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')");
@@ -4729,6 +4730,36 @@ public class SqlOperatorTest {
f.checkQuery("select regexp_extract_all('55as56664as422', '\\d{10}')");
}
+ @Test void testRegexpInstrFunc() {
+ final SqlOperatorFixture f =
+
fixture().setFor(SqlLibraryOperators.REGEXP_INSTR).withLibrary(SqlLibrary.BIG_QUERY);
+
+ f.checkScalar("regexp_instr('abc def ghi', 'def')", 5, "INTEGER NOT NULL");
+ f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 2)", 4, "INTEGER NOT
NULL");
+ f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 1, 3)", 7, "INTEGER NOT
NULL");
+ f.checkScalar("regexp_instr('abcadcaecghi', 'a.c', 1, 3, 1)", 10, "INTEGER
NOT NULL");
+ f.checkScalar("regexp_instr('a9cadca513ca4cecghi', 'a([0-9]+)', 1, 2, 1)",
11,
+ "INTEGER NOT NULL");
+ f.checkScalar("regexp_instr('a9cadca513ca4cecghi', 'a([0-9]*)', 8, 1, 0)",
13,
+ "INTEGER NOT NULL");
+ f.checkScalar("regexp_instr('55as56664as422', '\\d{3}', 3, 2, 0)", 12,
"INTEGER NOT NULL");
+ f.checkScalar("regexp_instr('55as56664as422', '\\d{2}', 2, 2, 1)", 9,
"INTEGER NOT NULL");
+ f.checkScalar("regexp_instr('55as56664as422', '', 2, 2, 1)", 0, "INTEGER
NOT NULL");
+
+ f.checkNull("regexp_instr('abc def ghi', cast(null as varchar))");
+ f.checkNull("regexp_instr(cast(null as varchar), 'abc')");
+ f.checkNull("regexp_instr(cast(null as varchar), cast(null as varchar))");
+ f.checkNull("regexp_instr('abc def ghi', 'abc', cast(null as integer))");
+ f.checkNull("regexp_instr('abc def ghi', 'abc', 1, cast(null as
integer))");
+ f.checkNull("regexp_instr('abc def ghi', 'abc', 1, 3, cast(null as
integer))");
+
+ f.checkQuery("select regexp_instr('abc def ghi', 'abc')");
+ f.checkQuery("select regexp_instr('[email protected]',
'@[a-zA-Z0-9-]+\\\\.[a-zA-Z0-9-.]+')");
+ f.checkQuery("select regexp_instr('55as56664as422', '\\d{10}')");
+ f.checkQuery("select regexp_instr('abcadcabcaecghi', 'c(a.c)', 4)");
+ f.checkQuery("select regexp_instr('a9cadca5c4aecghi', 'a[0-9]c', 1, 3)");
+ }
+
@Test void testRegexpReplaceFunc() {
final SqlOperatorFixture f0 = fixture();
final Consumer<SqlOperatorFixture> consumer = f -> {