My apologies for the links in the head, the email formatting and the missing patch, I accidently send the email too early.
-- Gilles
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index bf99f82149..88e08b40d2 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3097,6 +3097,47 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue> </para></entry> </row> + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>regexp_count</primary> + </indexterm> + <function>regexp_count</function> ( <parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>position</parameter> <type>integer</type> ] [, <parameter>flags</parameter> <type>text</type> ] ) + <returnvalue>integer</returnvalue> + </para> + <para> + Return the number of times a pattern occurs for a match of a POSIX + regular expression to the <parameter>string</parameter>; see + <xref linkend="functions-posix-regexp"/>. + </para> + <para> + <literal>regexp_count('123456789012', '\d{3}', 3)</literal> + <returnvalue>3</returnvalue> + </para></entry> + </row> + + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>regexp_instr</primary> + </indexterm> + <function>regexp_instr</function> ( <parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>position</parameter> <type>integer</type> ] [, <parameter>occurence</parameter> <type>integer</type> ] [, <parameter>returnopt</parameter> <type>integer</type> ] [, <parameter>flags</parameter> <type>text</type> ] [, <parameter>group</parameter> <type>integer</type> ] ) + <returnvalue>integer</returnvalue> + </para> + <para> + Return the position within <parameter>string</parameter> where the + match of a POSIX regular expression occurs. It returns an integer + indicating the beginning or ending position of the matched substring, + depending on the value of the <parameter>returnopt</parameter> argument + (default beginning). If no match is found, then the function returns 0; + see <xref linkend="functions-posix-regexp"/>. + </para> + <para> + <literal>regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 4)</literal> + <returnvalue>7</returnvalue> + </para></entry> + </row> + <row> <entry role="func_table_entry"><para role="func_signature"> <indexterm> @@ -3157,6 +3198,24 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue> </para></entry> </row> + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>regexp_substr</primary> + </indexterm> + <function>regexp_substr</function> ( <parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>position</parameter> <type>integer</type> ] [, <parameter>occurence</parameter> <type>integer</type> ] [, <parameter>flags</parameter> <type>text</type> ] [, <parameter>group</parameter> <type>integer</type> ] ) + <returnvalue>text</returnvalue> + </para> + <para> + Return the substring within <parameter>string</parameter> corresponding to the + match of a POSIX regular expression; see <xref linkend="functions-posix-regexp"/>. + </para> + <para> + <literal>regexp_substr('1234567890 1234557890', '(123)(4(5[56])(78))', 1, 2, 'i', 3)</literal> + <returnvalue>55</returnvalue> + </para></entry> + </row> + <row> <entry role="func_table_entry"><para role="func_signature"> <indexterm> @@ -5295,6 +5354,15 @@ substring('foobar' similar '#"o_b#"%' escape '#') <lineannotation>NULL</linea <indexterm> <primary>regexp_split_to_array</primary> </indexterm> + <indexterm> + <primary>regexp_count</primary> + </indexterm> + <indexterm> + <primary>regexp_instr</primary> + </indexterm> + <indexterm> + <primary>regexp_substr</primary> + </indexterm> <para> <xref linkend="functions-posix-table"/> lists the available @@ -5669,6 +5737,132 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; in practice. Other software systems such as Perl use similar definitions. </para> + <para> + The <function>regexp_count</function> function returns the number of + captured substring(s) resulting from matching a POSIX regular + expression pattern to a string. It has the syntax <function>regexp_replace</function>( + <replaceable>string</replaceable>, <replaceable>pattern</replaceable> <optional>, + <replaceable>position</replaceable> </optional> <optional>, <replaceable>flags</replaceable> </optional>). + <replaceable>pattern</replaceable> is searched in <replaceable>string</replaceable> starting + from an optional <replaceable>position</replaceable> or from the beginning of <replaceable>string</replaceable> + by default. The <replaceable>flags</replaceable> parameter is an optional text string + containing zero or more single-letter flags that change the function's behavior. + <function>regexp_count</function> accepts all the flags + shown in <xref linkend="posix-embedded-options-table"/>. + The <literal>g</literal> flag is forced internally to count all matches. + This function returns 0 if there is no match or the number of match as + an integer. + </para> + + <para> + Some examples: +<programlisting> +SELECT regexp_count('123123123123', '\d{3}', 1); + regexp_count +-------------- + 4 +(1 row) + +SELECT regexp_count('Hello'||CHR(10)||'world!', '^world!$', 1, 'm'); + regexp_count +-------------- + 1 +(1 row) +</programlisting> + </para> + + <para> + The <function>regexp_instr</function> function returns the beginning or ending + position of the matched substring resulting from matching a POSIX regular + expression pattern to a string. It has the syntax <function>regexp_instr</function>( + <replaceable>string</replaceable>, <replaceable>pattern</replaceable> <optional>, + <replaceable>position</replaceable> </optional> <optional>, <replaceable>occurrence</replaceable> </optional> + <optional>, <replaceable>returnopt</replaceable> </optional> + <optional>, <replaceable>flags</replaceable> </optional> + <optional>, <replaceable>group</replaceable> </optional>). + <replaceable>pattern</replaceable> is searched in <replaceable>string</replaceable> starting + from an optional <replaceable>position</replaceable> or from the beginning + of <replaceable>string</replaceable> by default. <replaceable>occurrence</replaceable> + indicates which occurrence of <replaceable>pattern</replaceable> in <replaceable>string</replaceable> + should be searched. When <replaceable>returnopt</replaceable> is set to 0 (default) the function + returns the position of the first character of the occurrence. When set to 1 returns the position + of the character after the occurrence. + The <replaceable>flags</replaceable> parameter is an optional text string + containing zero or more single-letter flags that change the function's behavior. + <function>regexp_count</function> accepts all the flags + shown in <xref linkend="posix-embedded-options-table"/>. + The <literal>g</literal> flag is forced internally to track all matches. + For a pattern with capture groups, <replaceable>group</replaceable> is an integer indicating + which capture in pattern is the target of the function. A capture group is a part of the pattern + enclosed in parentheses. Capture groups can be nested. They are numbered in order in which their + left parentheses appear in pattern. If <replaceable>group</replaceable> is zero, then the position + of the entire substring that matches the pattern is returned. If <replaceable>pattern</replaceable> + does not have at least <replaceable>group</replaceable> capture group, the function returns zero. + This function returns 0 if there is no match or the starting or ending position + of match as an integer. + </para> + + <para> + Some examples: +<programlisting> +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 1, 6); + regexp_instr +-------------- + 32 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 3); + regexp_instr +-------------- + 5 +(1 row) +</programlisting> + </para> + + <para> + The <function>regexp_substr</function> function returns the + matched substring resulting from matching a POSIX regular + expression pattern to a string. It has the syntax <function>regexp_substr</function>( + <replaceable>string</replaceable>, <replaceable>pattern</replaceable> <optional>, + <replaceable>position</replaceable> </optional> <optional>, <replaceable>occurrence</replaceable> </optional> + <optional>, <replaceable>flags</replaceable> </optional> + <optional>, <replaceable>group</replaceable> </optional>). + <replaceable>pattern</replaceable> is searched in <replaceable>string</replaceable> starting + from an optional <replaceable>position</replaceable> or from the beginning + of <replaceable>string</replaceable> by default. <replaceable>occurrence</replaceable> + indicates which occurrence of <replaceable>pattern</replaceable> in <replaceable>string</replaceable> + should be searched. The <replaceable>flags</replaceable> parameter is an optional text string + containing zero or more single-letter flags that change the function's behavior. + <function>regexp_count</function> accepts all the flags + shown in <xref linkend="posix-embedded-options-table"/>. + The <literal>g</literal> flag is forced internally to track all matches. + For a pattern with capture groups, <replaceable>group</replaceable> is an integer indicating + which capture in pattern is the target of the function. A capture group is a part of the pattern + enclosed in parentheses. Capture groups can be nested. They are numbered in order in which their + left parentheses appear in pattern. If <replaceable>group</replaceable> is zero, then the position + of the entire substring that matches the pattern is returned. If <replaceable>pattern</replaceable> + does not have at least <replaceable>group</replaceable> capture group, the function returns zero. + This function returns NULL if there is no match or the substring of match. + </para> + + <para> + Some examples: +<programlisting> +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 2); + regexp_substr +--------------- + , FR +(1 row) + +SELECT regexp_substr('1234567890 1234557890', '(123)(4(5[56])(78))', 1, 2, 'i', 3); + regexp_substr +--------------- + 55 +(1 row) +</programlisting> + </para> + + <!-- derived from the re_syntax.n man page --> <sect3 id="posix-syntax-details"> diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index a32c5c82ab..42c25cdd75 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -120,6 +120,7 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx); static Datum build_regexp_split_result(regexp_matches_ctx *splitctx); +static text *enclose_with_parenthesis(text *str); /* * RE_compile_and_cache - compile a RE, caching if possible @@ -555,7 +556,6 @@ texticregexne(PG_FUNCTION_ARGS) 0, NULL)); } - /* * textregexsubstr() * Return a substring matched by a regular expression. @@ -1063,6 +1063,350 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS) return regexp_matches(fcinfo); } +/* + * regexp_count() + * Return the number of matches of a pattern within a string. + */ +Datum +regexp_count(PG_FUNCTION_ARGS) +{ + text *orig_str = NULL; + text *pattern = PG_GETARG_TEXT_PP(1); + int start = 1; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(3); + + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + if (PG_NARGS() > 2) + start = PG_GETARG_INT32(2); + + if (start < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("negative start position not allowed."))); + + /* regexp_count(string, pattern, start[, flags]) */ + if (start > 1) + orig_str = DatumGetTextPP(DirectFunctionCall2(text_substr_no_len, + PG_GETARG_DATUM(0), + Int32GetDatum(start) + )); + else + orig_str = PG_GETARG_TEXT_PP(0); + + /* Determine options */ + parse_re_flags(&re_flags, flags); + + /* this function require flag 'g' */ + re_flags.glob = true; + + matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, + PG_GET_COLLATION(), true, false, false); + + PG_RETURN_INT32(matchctx->nmatches); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_count_no_start(PG_FUNCTION_ARGS) +{ + return regexp_count(fcinfo); +} + + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_count_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_count(fcinfo); +} + +/* + * Return text string between parenthesis + */ +static text * +enclose_with_parenthesis(text *str) +{ + int len = VARSIZE_ANY_EXHDR(str); + text *result; + char *ptr; + + result = palloc(len + VARHDRSZ + 2); + SET_VARSIZE(result, len + VARHDRSZ + 2); + ptr = VARDATA(result); + memcpy(ptr, "(", 1); + memcpy(ptr+1, VARDATA_ANY(str), len); + memcpy(ptr+len+1, ")", 1); + + return result; +} + +/* + * regexp_instr() + * Return the position within the string where the match was located + */ +Datum +regexp_instr(PG_FUNCTION_ARGS) +{ + text *orig_str = NULL; + text *pattern = NULL; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5); + int start = 1; + int occurrence = 1; + int return_opt = 0; + int subexpr = 0; + int pos; + + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* regexp_instr(string, pattern, start) */ + if (PG_NARGS() > 2) + start = PG_GETARG_INT32(2); + + if (start < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("negative start position not allowed."))); + + if (start > 1) + orig_str = DatumGetTextPP(DirectFunctionCall2(text_substr_no_len, + PG_GETARG_DATUM(0), + Int32GetDatum(start) + )); + else + /* regexp_instr(string, pattern) */ + orig_str = PG_GETARG_TEXT_PP(0); + + if (orig_str == NULL) + PG_RETURN_NULL(); + + /* regexp_instr(string, pattern, start, occurrence) */ + if (PG_NARGS() > 3) + occurrence = PG_GETARG_INT32(3); + + if (occurrence <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("occurrence value must be greater than 0."))); + + /* regexp_instr(string, pattern, start, occurrence, return_opt) */ + if (PG_NARGS() > 4) + return_opt = PG_GETARG_INT32(4); + + if (return_opt != 0 && return_opt != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("return option value must be 0 or 1."))); + + /* regexp_instr(string, pattern, start, occurrence, return_opt, flags, subexpr) */ + if (PG_NARGS() > 6) + subexpr = PG_GETARG_INT32(6); + if (subexpr < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of capture group must be a positive value."))); + + /* + * If subexpr is zero (default), then the position of the entire + * substring that matches the pattern is returned. Otherwise we + * will exactly register the subexpressions given in the pattern. + * Enclose pattern between parenthesis to register the position + * of the entire substring. + */ + pattern = enclose_with_parenthesis(PG_GETARG_TEXT_PP(1)); + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* this function require flag 'g' */ + re_flags.glob = true; + + matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, + PG_GET_COLLATION(), true, false, false); + + /* If no match is found, then the function returns 0 */ + if (matchctx->nmatches == 0) + PG_RETURN_INT32(0); + + /* When occurrence exceed matches return 0 */ + if (occurrence > matchctx->nmatches) + PG_RETURN_INT32(0); + + /* When subexpr exceed number of subexpression return 0 */ + if (subexpr > matchctx->npatterns - 1) + PG_RETURN_INT32(0); + + /* + * Returns the position of the first character of the occurrence + * or for subexpression in this occurrence. + */ + pos = (occurrence*matchctx->npatterns*2)-((matchctx->npatterns-subexpr)*2); + if (return_opt == 1) + pos += 1; + + PG_RETURN_INT32(matchctx->match_locs[pos]+start); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_start(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_occurrence(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_return_opt(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_instr_no_subexpr(PG_FUNCTION_ARGS) +{ + return regexp_instr(fcinfo); +} + +/* + * regexp_substr() + * Return the substring within the string that match a regular + * expression pattern + */ +Datum +regexp_substr(PG_FUNCTION_ARGS) +{ + text *orig_str = NULL; + text *pattern = NULL; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(4); + int start = 1; + int occurrence = 1; + int subexpr = 0; + int so, eo, pos; + + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + if (PG_NARGS() > 2) + start = PG_GETARG_INT32(2); + + if (start < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("negative start position not allowed."))); + + /* regexp_substr(string, pattern, start) */ + if (start > 1) + orig_str = DatumGetTextPP(DirectFunctionCall2(text_substr_no_len, + PG_GETARG_DATUM(0), + Int32GetDatum(start) + )); + else + /* regexp_substr(string, pattern) */ + orig_str = PG_GETARG_TEXT_PP(0); + + if (orig_str == NULL) + PG_RETURN_NULL(); + + /* regexp_substr(string, pattern, start, occurrence) */ + if (PG_NARGS() > 3) + occurrence = PG_GETARG_INT32(3); + if (occurrence <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("occurrence value must be greater than 0."))); + + /* regexp_substr(string, pattern, start, occurrence, flags, subexpr) */ + if (PG_NARGS() > 5) + subexpr = PG_GETARG_INT32(5); + if (subexpr < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of capture group must be a positive value."))); + + /* + * If subexpr is zero (default), then the position of the entire + * substring that matches the pattern is returned. Otherwise we + * will exactly register the subexpressions given in the pattern. + * Enclose pattern between parenthesis to register the position + * of the entire substring. + */ + pattern = enclose_with_parenthesis(PG_GETARG_TEXT_PP(1)); + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* this function require flag 'g' */ + re_flags.glob = true; + + matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, + PG_GET_COLLATION(), true, false, false); + + /* If no match is found, then the function returns NULL */ + if (matchctx->nmatches == 0) + PG_RETURN_NULL(); + + /* When occurrence exceed matches return NULL */ + if (occurrence > matchctx->nmatches) + PG_RETURN_NULL(); + + /* When subexpr exceed number of subexpression return NULL */ + if (subexpr > matchctx->npatterns - 1) + PG_RETURN_NULL(); + + /* Returns the substring corresponding to the occurrence. */ + pos = (occurrence*matchctx->npatterns*2)-((matchctx->npatterns-subexpr)*2); + so = matchctx->match_locs[pos]+1; + eo = matchctx->match_locs[pos+1]+1; + + PG_RETURN_DATUM(DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(so), + Int32GetDatum(eo - so))); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_start(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_occurrence(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_substr_no_subexpr(PG_FUNCTION_ARGS) +{ + return regexp_substr(fcinfo); +} + /* * setup_regexp_matches --- do the initial matching for regexp_match * and regexp_split functions diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 3d3974f467..aae9226ec5 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3557,6 +3557,48 @@ proname => 'regexp_matches', prorows => '10', proretset => 't', prorettype => '_text', proargtypes => 'text text text', prosrc => 'regexp_matches' }, +{ oid => '9614', descr => 'count match(es) for regexp', + proname => 'regexp_count', prorettype => 'int4', proargtypes => 'text text', + prosrc => 'regexp_count_no_start' }, +{ oid => '9615', descr => 'count match(es) for regexp', + proname => 'regexp_count', prorettype => 'int4', proargtypes => 'text text int4', + prosrc => 'regexp_count_no_flags' }, +{ oid => '9616', descr => 'count match(es) for regexp', + proname => 'regexp_count', prorettype => 'int4', + proargtypes => 'text text int4 text', prosrc => 'regexp_count' }, +{ oid => '9617', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text', + prosrc => 'regexp_instr_no_start' }, +{ oid => '9618', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4', + prosrc => 'regexp_instr_no_occurrence' }, +{ oid => '9619', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4 int4', + prosrc => 'regexp_instr_no_return_opt' }, +{ oid => '9620', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4 int4 int4', + prosrc => 'regexp_instr_no_flags' }, +{ oid => '9621', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4 int4 int4 text', + prosrc => 'regexp_instr_no_subexpr' }, +{ oid => '9622', descr => 'position where the match for regexp was located', + proname => 'regexp_instr', prorettype => 'int4', proargtypes => 'text text int4 int4 int4 text int4', + prosrc => 'regexp_instr' }, +{ oid => '9623', descr => 'substring that match the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text', + prosrc => 'regexp_substr_no_start' }, +{ oid => '9624', descr => 'substring that match the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text int4', + prosrc => 'regexp_substr_no_occurrence' }, +{ oid => '9625', descr => 'substring that match the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text int4 int4', + prosrc => 'regexp_substr_no_flags' }, +{ oid => '9626', descr => 'substring that match the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text int4 int4 text', + prosrc => 'regexp_substr_no_subexpr' }, +{ oid => '9627', descr => 'substring that match the regexp pattern', + proname => 'regexp_substr', prorettype => 'text', proargtypes => 'text text int4 int4 text int4', + prosrc => 'regexp_substr' }, { oid => '2088', descr => 'split string by field_sep and return field_num', proname => 'split_part', prorettype => 'text', proargtypes => 'text text int4', prosrc => 'split_part' }, diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index fb4573d85f..99372be6f0 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -905,6 +905,393 @@ SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ov ERROR: regexp_split_to_table() does not support the "global" option SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g'); ERROR: regexp_split_to_array() does not support the "global" option +-- count all matches for regexp +SELECT regexp_count('123123123123123', '(12)3'); + regexp_count +-------------- + 5 +(1 row) + +-- count all matches with start position +SELECT regexp_count('123123123123', '123', 0); + regexp_count +-------------- + 4 +(1 row) + +SELECT regexp_count('123123123123', '123', 1); + regexp_count +-------------- + 4 +(1 row) + +SELECT regexp_count('123123123123', '123', 3); + regexp_count +-------------- + 3 +(1 row) + +SELECT regexp_count('123123123123', '123', -3); +ERROR: negative start position not allowed. +-- count all matches in NULL string with a start position +SELECT regexp_count(NULL, '123', 3); + regexp_count +-------------- + \N +(1 row) + +-- count all matches with a start position greater than string length +SELECT regexp_count('123', '123', 10); + regexp_count +-------------- + 0 +(1 row) + +-- count all matches from different regexp +SELECT regexp_count('ABC123', '[A-Z]'), regexp_count('A1B2C3', '[A-Z]'); + regexp_count | regexp_count +--------------+-------------- + 3 | 3 +(1 row) + +SELECT regexp_count('ABC123', '[A-Z][0-9]'), regexp_count('A1B2C3', '[A-Z][0-9]'); + regexp_count | regexp_count +--------------+-------------- + 1 | 3 +(1 row) + +SELECT regexp_count('ABC123', '[A-Z][0-9]{2}'), regexp_count('A1B2C3', '[A-Z][0-9]{2}'); + regexp_count | regexp_count +--------------+-------------- + 1 | 0 +(1 row) + +SELECT regexp_count('ABC123', '([A-Z][0-9]){2}'), regexp_count('A1B2C3', '([A-Z][0-9]){2}'); + regexp_count | regexp_count +--------------+-------------- + 0 | 1 +(1 row) + +SELECT regexp_count('ABC123A5', '^[A-Z][0-9]'), regexp_count('A1B2C3', '^[A-Z][0-9]'); + regexp_count | regexp_count +--------------+-------------- + 0 | 1 +(1 row) + +SELECT regexp_count('ABC123', '[A-Z][0-9]{2}'), regexp_count('A1B2C34', '[A-Z][0-9]{2}'); + regexp_count | regexp_count +--------------+-------------- + 1 | 1 +(1 row) + +-- count matches with newline case insensivity +SELECT regexp_count('a'||CHR(10)||'d', 'a.d'); + regexp_count +-------------- + 1 +(1 row) + +SELECT regexp_count('a'||CHR(10)||'d', 'a.d', 1, 's'); + regexp_count +-------------- + 1 +(1 row) + +-- count matches with newline case sensivity +SELECT regexp_count('a'||CHR(10)||'d', 'a.d', 1, 'n'); + regexp_count +-------------- + 0 +(1 row) + +-- count not multiline matches +SELECT regexp_count('a'||CHR(10)||'d', '^d$'); + regexp_count +-------------- + 0 +(1 row) + +-- count multiline matches +SELECT regexp_count('a'||CHR(10)||'d', '^d$', 1, 'm'); + regexp_count +-------------- + 1 +(1 row) + +SELECT regexp_count('Hello'||CHR(10)||'world!', '^world!$', 1, 'm'); -- 1 + regexp_count +-------------- + 1 +(1 row) + +DROP TABLE IF EXISTS regexp_temp; +NOTICE: table "regexp_temp" does not exist, skipping +CREATE TABLE regexp_temp(fullname varchar(20), email varchar(20)); +INSERT INTO regexp_temp (fullname, email) VALUES ('John Doe', 'john...@example.com'); +INSERT INTO regexp_temp (fullname, email) VALUES ('Jane Doe', 'janedoe'); +-- count matches case sensitive +SELECT fullname, regexp_count(fullname, 'e', 1, 'c') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 1 + Jane Doe | 2 +(2 rows) + +SELECT fullname, regexp_count(fullname, 'D', 1, 'c') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 1 + Jane Doe | 1 +(2 rows) + +SELECT fullname, regexp_count(fullname, 'd', 1, 'c') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 0 + Jane Doe | 0 +(2 rows) + +-- count matches case insensitive +SELECT fullname, regexp_count(fullname, 'E', 1, 'i') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 1 + Jane Doe | 2 +(2 rows) + +SELECT fullname, regexp_count(fullname, 'do', 1, 'i') FROM regexp_temp; + fullname | regexp_count +----------+-------------- + John Doe | 1 + Jane Doe | 1 +(2 rows) + +-- return the start position of the 6th occurence starting at beginning of the string +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 1, 6); + regexp_instr +-------------- + 32 +(1 row) + +-- return the start position of the 5th occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 7, 5, 0); + regexp_instr +-------------- + 32 +(1 row) + +-- return the ending position of the 5th occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 7, 5, 0); + regexp_instr +-------------- + 32 +(1 row) + +-- return the ending position of the 2nd occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[T|Z|S][[:alpha:]]{5}', 7, 2, 1, 'i'); + regexp_instr +-------------- + 30 +(1 row) + +-- return the starting position corresponding to the different capture group +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 0); + regexp_instr +-------------- + 1 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 1); + regexp_instr +-------------- + 1 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 2); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 3); + regexp_instr +-------------- + 5 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 4); + regexp_instr +-------------- + 7 +(1 row) + +-- return the starting position corresponding to a non existant capture group +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 5); + regexp_instr +-------------- + 0 +(1 row) + +-- Same but with the ending position +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 0); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 1); + regexp_instr +-------------- + 4 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 2); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 3); + regexp_instr +-------------- + 7 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 4); + regexp_instr +-------------- + 9 +(1 row) + +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 5); + regexp_instr +-------------- + 0 +(1 row) + +-- start position of a valid email +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+') "valid_email" FROM regexp_temp; + email | valid_email +---------------------+------------- + john...@example.com | 1 + janedoe | 0 +(2 rows) + +-- ending position of a valid email +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 1) "valid_email" FROM regexp_temp; + email | valid_email +---------------------+------------- + john...@example.com | 20 + janedoe | 0 +(2 rows) + +-- start position of first capture group in the email (the dot part) +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 0, 'i', 1) FROM regexp_temp; + email | regexp_instr +---------------------+-------------- + john...@example.com | 16 + janedoe | 0 +(2 rows) + +-- ending position of first capture group in the email (the dot part) +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 1, 'i', 1) FROM regexp_temp; + email | regexp_instr +---------------------+-------------- + john...@example.com | 20 + janedoe | 0 +(2 rows) + +-- test negative values +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', -1, 1, 1, 'i', 1); +ERROR: negative start position not allowed. +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, -1, 1, 'i', 1); +ERROR: occurrence value must be greater than 0. +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, -1, 'i', 1); +ERROR: return option value must be 0 or 1. +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', -1); +ERROR: number of capture group must be a positive value. +-- return the substring matching the regexp +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+'); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- return the substring matching the regexp +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 24); + regexp_substr +--------------- + , FR +(1 row) + +-- return the substring matching the regexp at the first occurrence +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 1); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- return the substring matching the regexp at the second occurrence +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 2); + regexp_substr +--------------- + , FR +(1 row) + +-- case sensitivity substring search +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1); + regexp_substr +--------------- + \N +(1 row) + +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i'); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- case sensitivity substring search with no capture group +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i', 0); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- case sensitivity substring search with non existing capture group +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i', 0); + regexp_substr +---------------- + , zipcode town +(1 row) + +-- return the substring matching the regexp at different occurrence and capture group +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', 4); + regexp_substr +--------------- + 78 +(1 row) + +SELECT regexp_substr('1234567890 1234557890', '(123)(4(5[56])(78))', 1, 2, 'i', 3); + regexp_substr +--------------- + 55 +(1 row) + +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', 0); + regexp_substr +--------------- + 12345678 +(1 row) + +-- test negative values +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', -1, 1, 'i', 4); +ERROR: negative start position not allowed. +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, -1, 'i', 4); +ERROR: occurrence value must be greater than 0. +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', -4); +ERROR: number of capture group must be a positive value. +DROP TABLE IF EXISTS regexp_temp; -- change NULL-display back \pset null '' -- E021-11 position expression diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 57a48c9d0b..30d44cc6da 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -255,6 +255,110 @@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g') AS foo; SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g'); +-- count all matches for regexp +SELECT regexp_count('123123123123123', '(12)3'); +-- count all matches with start position +SELECT regexp_count('123123123123', '123', 0); +SELECT regexp_count('123123123123', '123', 1); +SELECT regexp_count('123123123123', '123', 3); +SELECT regexp_count('123123123123', '123', -3); +-- count all matches in NULL string with a start position +SELECT regexp_count(NULL, '123', 3); +-- count all matches with a start position greater than string length +SELECT regexp_count('123', '123', 10); +-- count all matches from different regexp +SELECT regexp_count('ABC123', '[A-Z]'), regexp_count('A1B2C3', '[A-Z]'); +SELECT regexp_count('ABC123', '[A-Z][0-9]'), regexp_count('A1B2C3', '[A-Z][0-9]'); +SELECT regexp_count('ABC123', '[A-Z][0-9]{2}'), regexp_count('A1B2C3', '[A-Z][0-9]{2}'); +SELECT regexp_count('ABC123', '([A-Z][0-9]){2}'), regexp_count('A1B2C3', '([A-Z][0-9]){2}'); +SELECT regexp_count('ABC123A5', '^[A-Z][0-9]'), regexp_count('A1B2C3', '^[A-Z][0-9]'); +SELECT regexp_count('ABC123', '[A-Z][0-9]{2}'), regexp_count('A1B2C34', '[A-Z][0-9]{2}'); +-- count matches with newline case insensivity +SELECT regexp_count('a'||CHR(10)||'d', 'a.d'); +SELECT regexp_count('a'||CHR(10)||'d', 'a.d', 1, 's'); +-- count matches with newline case sensivity +SELECT regexp_count('a'||CHR(10)||'d', 'a.d', 1, 'n'); +-- count not multiline matches +SELECT regexp_count('a'||CHR(10)||'d', '^d$'); +-- count multiline matches +SELECT regexp_count('a'||CHR(10)||'d', '^d$', 1, 'm'); +SELECT regexp_count('Hello'||CHR(10)||'world!', '^world!$', 1, 'm'); -- 1 + +DROP TABLE IF EXISTS regexp_temp; +CREATE TABLE regexp_temp(fullname varchar(20), email varchar(20)); +INSERT INTO regexp_temp (fullname, email) VALUES ('John Doe', 'john...@example.com'); +INSERT INTO regexp_temp (fullname, email) VALUES ('Jane Doe', 'janedoe'); +-- count matches case sensitive +SELECT fullname, regexp_count(fullname, 'e', 1, 'c') FROM regexp_temp; +SELECT fullname, regexp_count(fullname, 'D', 1, 'c') FROM regexp_temp; +SELECT fullname, regexp_count(fullname, 'd', 1, 'c') FROM regexp_temp; +-- count matches case insensitive +SELECT fullname, regexp_count(fullname, 'E', 1, 'i') FROM regexp_temp; +SELECT fullname, regexp_count(fullname, 'do', 1, 'i') FROM regexp_temp; + +-- return the start position of the 6th occurence starting at beginning of the string +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 1, 6); +-- return the start position of the 5th occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 7, 5, 0); +-- return the ending position of the 5th occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[^ ]+', 7, 5, 0); +-- return the ending position of the 2nd occurence starting after the first word +SELECT regexp_instr('number of your street, zipcode thetown, FR', '[T|Z|S][[:alpha:]]{5}', 7, 2, 1, 'i'); +-- return the starting position corresponding to the different capture group +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 0); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 2); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 3); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 4); +-- return the starting position corresponding to a non existant capture group +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 0, 'i', 5); +-- Same but with the ending position +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 0); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 2); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 3); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 4); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', 5); +-- start position of a valid email +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+') "valid_email" FROM regexp_temp; +-- ending position of a valid email +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 1) "valid_email" FROM regexp_temp; +-- start position of first capture group in the email (the dot part) +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 0, 'i', 1) FROM regexp_temp; +-- ending position of first capture group in the email (the dot part) +SELECT email, regexp_instr(email, '\w+@\w+(\.\w+)+', 1, 1, 1, 'i', 1) FROM regexp_temp; +-- test negative values +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', -1, 1, 1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, -1, 1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, -1, 'i', 1); +SELECT regexp_instr('1234567890', '(123)(4(56)(78))', 1, 1, 1, 'i', -1); + +-- return the substring matching the regexp +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+'); +-- return the substring matching the regexp +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 24); +-- return the substring matching the regexp at the first occurrence +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 1); +-- return the substring matching the regexp at the second occurrence +SELECT regexp_substr('number of your street, zipcode town, FR', ',[^,]+', 1, 2); +-- case sensitivity substring search +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1); +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i'); +-- case sensitivity substring search with no capture group +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i', 0); +-- case sensitivity substring search with non existing capture group +SELECT regexp_substr('number of your street, zipcode town, FR', ',\s+[Zf][^,]+', 1, 1, 'i', 0); +-- return the substring matching the regexp at different occurrence and capture group +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', 4); +SELECT regexp_substr('1234567890 1234557890', '(123)(4(5[56])(78))', 1, 2, 'i', 3); +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', 0); +-- test negative values +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', -1, 1, 'i', 4); +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, -1, 'i', 4); +SELECT regexp_substr('1234567890 1234567890', '(123)(4(56)(78))', 1, 1, 'i', -4); + +DROP TABLE IF EXISTS regexp_temp; + -- change NULL-display back \pset null ''