At
<https://wiki.postgresql.org/wiki/PostgreSQL_vs_SQL_Standard#Obsolete_syntax_for_substring.28.29>
it is described that the substring pattern matching syntax in PostgreSQL
does not conform to the current standard. PostgreSQL implements
SUBSTRING(text FROM pattern FOR escapechar)
whereas the current standard says
SUBSTRING(text SIMILAR pattern ESCAPE escapechar)
The former was in SQL99, but the latter has been there since SQL:2003.
It's pretty easy to implement the second form also, so here is a patch
that does that.
--
Peter Eisentraut http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From e6ab38e476d65f592d8542aac68ab6a23d007668 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Fri, 19 Jun 2020 11:14:10 +0200
Subject: [PATCH 1/2] Clean up grammar a bit
Simplify the grammar specification of substring() and overlay() a bit,
simplify and update some comments.
---
src/backend/parser/gram.y | 73 ++++++++++++---------------------------
1 file changed, 23 insertions(+), 50 deletions(-)
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index e669d75a5a..1a843049f0 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -452,7 +452,6 @@ static Node *makeRecursiveViewSelect(char *relname, List
*aliases, Node *query);
%type <list> extract_list overlay_list position_list
%type <list> substr_list trim_list
%type <list> opt_interval interval_second
-%type <node> overlay_placing substr_from substr_for
%type <str> unicode_normal_form
%type <boolean> opt_instead
@@ -13797,11 +13796,6 @@ func_expr_common_subexpr:
}
| OVERLAY '(' overlay_list ')'
{
- /* overlay(A PLACING B FROM C FOR D) is
converted to
- * overlay(A, B, C, D)
- * overlay(A PLACING B FROM C) is
converted to
- * overlay(A, B, C)
- */
$$ = (Node *)
makeFuncCall(SystemFuncName("overlay"), $3, @1);
}
| POSITION '(' position_list ')'
@@ -14437,63 +14431,45 @@ unicode_normal_form:
| NFKD
{ $$ = "nfkd"; }
;
-/* OVERLAY() arguments
- * SQL99 defines the OVERLAY() function:
- * o overlay(text placing text from int for int)
- * o overlay(text placing text from int)
- * and similarly for binary strings
- */
+/* OVERLAY() arguments */
overlay_list:
- a_expr overlay_placing substr_from substr_for
+ a_expr PLACING a_expr FROM a_expr FOR a_expr
{
- $$ = list_make4($1, $2, $3, $4);
+ /* overlay(A PLACING B FROM C FOR D) is
converted to overlay(A, B, C, D) */
+ $$ = list_make4($1, $3, $5, $7);
}
- | a_expr overlay_placing substr_from
+ | a_expr PLACING a_expr FROM a_expr
{
- $$ = list_make3($1, $2, $3);
+ /* overlay(A PLACING B FROM C) is
converted to overlay(A, B, C) */
+ $$ = list_make3($1, $3, $5);
}
;
-overlay_placing:
- PLACING a_expr
- { $$ = $2; }
- ;
-
/* position_list uses b_expr not a_expr to avoid conflict with general IN */
-
position_list:
b_expr IN_P b_expr
{ $$ = list_make2($3, $1); }
| /*EMPTY*/
{ $$ = NIL; }
;
-/* SUBSTRING() arguments
- * SQL9x defines a specific syntax for arguments to SUBSTRING():
- * o substring(text from int for int)
- * o substring(text from int) get entire string from starting point "int"
- * o substring(text for int) get first "int" characters of string
- * o substring(text from pattern) get entire string matching pattern
- * o substring(text from pattern for escape) same with specified escape char
- * We also want to support generic substring functions which accept
- * the usual generic list of arguments. So we will accept both styles
- * here, and convert the SQL9x style to the generic list for further
- * processing. - thomas 2000-11-28
- */
+/* SUBSTRING() arguments */
substr_list:
- a_expr substr_from substr_for
+ a_expr FROM a_expr FOR a_expr
{
- $$ = list_make3($1, $2, $3);
+ $$ = list_make3($1, $3, $5);
}
- | a_expr substr_for substr_from
+ | a_expr FOR a_expr FROM a_expr
{
- /* not legal per SQL99, but might as
well allow it */
- $$ = list_make3($1, $3, $2);
+ /* not legal per SQL, but might as well
allow it */
+ $$ = list_make3($1, $5, $3);
}
- | a_expr substr_from
+ | a_expr FROM a_expr
{
- $$ = list_make2($1, $2);
+ $$ = list_make2($1, $3);
}
- | a_expr substr_for
+ | a_expr FOR a_expr
{
+ /* not legal per SQL */
+
/*
* Since there are no cases where this
syntax allows
* a textual FOR value, we forcibly
cast the argument
@@ -14504,9 +14480,13 @@ substr_list:
* is unknown or doesn't have an
implicit cast to int4.
*/
$$ = list_make3($1, makeIntConst(1, -1),
-
makeTypeCast($2,
+
makeTypeCast($3,
SystemTypeName("int4"), -1));
}
+ /*
+ * We also want to support generic substring functions
that
+ * accept the usual generic list of arguments.
+ */
| expr_list
{
$$ = $1;
@@ -14515,13 +14495,6 @@ substr_list:
{ $$ = NIL; }
;
-substr_from:
- FROM a_expr
{ $$ = $2; }
- ;
-
-substr_for: FOR a_expr
{ $$ = $2; }
- ;
-
trim_list: a_expr FROM expr_list { $$ =
lappend($3, $1); }
| FROM expr_list
{ $$ = $2; }
| expr_list
{ $$ = $1; }
--
2.27.0
From 32b6d132232d40308c47faad111e75d5c03f6b63 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Fri, 19 Jun 2020 11:14:10 +0200
Subject: [PATCH 2/2] Add current substring regular expression syntax
SQL99 had syntax
SUBSTRING(text FROM pattern FOR escapechar)
but this was replaced in SQL:2003 by the more clear
SUBSTRING(text SIMILAR pattern ESCAPE escapechar)
but this was never implemented in PostgreSQL. This patch adds that
new syntax as an alternative in the parser, and updates documentation
and tests to indicate that this is the preferred alternative now.
---
doc/src/sgml/func.sgml | 20 ++++++++++++-----
src/backend/parser/gram.y | 26 +++++++++++++++++++++-
src/test/regress/expected/strings.out | 31 ++++++++++++++++-----------
src/test/regress/sql/strings.sql | 26 +++++++++++-----------
4 files changed, 73 insertions(+), 30 deletions(-)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 9d71678029..eedf189546 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -2669,15 +2669,21 @@ <title><acronym>SQL</acronym> String Functions and
Operators</title>
<row>
<entry role="func_table_entry"><para role="func_signature">
+ <function>substring</function> ( <parameter>string</parameter>
<type>text</type> <literal>SIMILAR</literal> <parameter>pattern</parameter>
<type>text</type> <literal>ESCAPE</literal> <parameter>escape</parameter>
<type>text</type> )
+ <returnvalue>text</returnvalue>
+ </para>
+ <para role="func_signature">
<function>substring</function> ( <parameter>string</parameter>
<type>text</type> <literal>FROM</literal> <parameter>pattern</parameter>
<type>text</type> <literal>FOR</literal> <parameter>escape</parameter>
<type>text</type> )
<returnvalue>text</returnvalue>
</para>
<para>
Extracts substring matching <acronym>SQL</acronym> regular expression;
- see <xref linkend="functions-similarto-regexp"/>.
+ see <xref linkend="functions-similarto-regexp"/>. The first form has
+ specified since SQL:2003; the second form was only in SQL:1999 and
+ should be considered obsolete.
</para>
<para>
- <literal>substring('Thomas' from '%#"o_a#"_' for '#')</literal>
+ <literal>substring('Thomas' similar '%#"o_a#"_' escape '#')</literal>
<returnvalue>oma</returnvalue>
</para></entry>
</row>
@@ -5160,7 +5166,11 @@ <title><function>SIMILAR TO</function> Regular
Expressions</title>
The <function>substring</function> function with three parameters
provides extraction of a substring that matches an SQL
regular expression pattern. The function can be written according
- to SQL99 syntax:
+ to standard SQL syntax:
+<synopsis>
+substring(<replaceable>string</replaceable> similar
<replaceable>pattern</replaceable> escape
<replaceable>escape-character</replaceable>)
+</synopsis>
+ or using the now obsolete SQL99 syntax:
<synopsis>
substring(<replaceable>string</replaceable> from
<replaceable>pattern</replaceable> for
<replaceable>escape-character</replaceable>)
</synopsis>
@@ -5201,8 +5211,8 @@ <title><function>SIMILAR TO</function> Regular
Expressions</title>
<para>
Some examples, with <literal>#"</literal> delimiting the return
string:
<programlisting>
-substring('foobar' from '%#"o_b#"%' for '#')
<lineannotation>oob</lineannotation>
-substring('foobar' from '#"o_b#"%' for '#')
<lineannotation>NULL</lineannotation>
+substring('foobar' similar '%#"o_b#"%' escape '#')
<lineannotation>oob</lineannotation>
+substring('foobar' similar '#"o_b#"%' escape '#')
<lineannotation>NULL</lineannotation>
</programlisting>
</para>
</sect2>
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 1a843049f0..5f21039b11 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -14451,7 +14451,27 @@ position_list:
| /*EMPTY*/
{ $$ = NIL; }
;
-/* SUBSTRING() arguments */
+/*
+ * SUBSTRING() arguments
+ *
+ * Note that SQL99 has both
+ *
+ * text FROM int FOR int
+ *
+ * and
+ *
+ * text FROM pattern FOR escape
+ *
+ * In the parser we map them both to a call to the substring() function and
+ * rely on type resolution to pick the right one.
+ *
+ * In SQL:2003, the second variant was changed to
+ *
+ * text SIMILAR pattern ESCAPE escape
+ *
+ * We could in theory map that to a different function internally, but
+ * since we still support the SQL99 version, we don't.
+ */
substr_list:
a_expr FROM a_expr FOR a_expr
{
@@ -14483,6 +14503,10 @@ substr_list:
makeTypeCast($3,
SystemTypeName("int4"), -1));
}
+ | a_expr SIMILAR a_expr ESCAPE a_expr
+ {
+ $$ = list_make3($1, $3, $5);
+ }
/*
* We also want to support generic substring functions
that
* accept the usual generic list of arguments.
diff --git a/src/test/regress/expected/strings.out
b/src/test/regress/expected/strings.out
index 6e98d183f6..8c034c9599 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -397,6 +397,13 @@ SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS
"456";
(1 row)
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd";
+ bcd
+-----
+ bcd
+(1 row)
+
+-- obsolete SQL99 syntax
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
bcd
-----
@@ -404,75 +411,75 @@ SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS
"bcd";
(1 row)
-- No match should return NULL
-SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-- Null inputs should return NULL
-SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True";
True
------
t
(1 row)
-SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True";
True
------
t
(1 row)
-- The first and last parts should act non-greedy
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg";
abcdefg
---------
abcdefg
(1 row)
-- Vertical bar in any part affects only that part
-SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef";
bcdef
-------
bcdef
(1 row)
-- Can't have more than two part separators
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error";
ERROR: SQL regular expression may not contain more than two
escape-double-quote separators
CONTEXT: SQL function "substring" statement 1
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
-SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg";
bcdefg
--------
bcdefg
(1 row)
-SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg";
abcdefg
---------
abcdefg
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 3e89159a4f..14901a2692 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -132,31 +132,33 @@
SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS "456";
-- T581 regular expression substring (with SQL's bizarre regexp syntax)
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"(b_d)#"%' ESCAPE '#') AS "bcd";
+-- obsolete SQL99 syntax
SELECT SUBSTRING('abcdefg' FROM 'a#"(b_d)#"%' FOR '#') AS "bcd";
-- No match should return NULL
-SELECT SUBSTRING('abcdefg' FROM '#"(b_d)#"%' FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '#"(b_d)#"%' ESCAPE '#') IS NULL AS "True";
-- Null inputs should return NULL
-SELECT SUBSTRING('abcdefg' FROM '%' FOR NULL) IS NULL AS "True";
-SELECT SUBSTRING(NULL FROM '%' FOR '#') IS NULL AS "True";
-SELECT SUBSTRING('abcdefg' FROM NULL FOR '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR '%' ESCAPE NULL) IS NULL AS "True";
+SELECT SUBSTRING(NULL SIMILAR '%' ESCAPE '#') IS NULL AS "True";
+SELECT SUBSTRING('abcdefg' SIMILAR NULL ESCAPE '#') IS NULL AS "True";
-- The first and last parts should act non-greedy
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*' ESCAPE '#') AS "abcdefg";
-- Vertical bar in any part affects only that part
-SELECT SUBSTRING('abcdefg' FROM 'a|b#"%#"g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a#"%#"x|g' FOR '#') AS "bcdef";
-SELECT SUBSTRING('abcdefg' FROM 'a#"%|ab#"g' FOR '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a|b#"%#"g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%#"x|g' ESCAPE '#') AS "bcdef";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%|ab#"g' ESCAPE '#') AS "bcdef";
-- Can't have more than two part separators
-SELECT SUBSTRING('abcdefg' FROM 'a*#"%#"g*#"x' FOR '#') AS "error";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a*#"%#"g*#"x' ESCAPE '#') AS "error";
-- Postgres extension: with 0 or 1 separator, assume parts 1 and 3 are empty
-SELECT SUBSTRING('abcdefg' FROM 'a#"%g' FOR '#') AS "bcdefg";
-SELECT SUBSTRING('abcdefg' FROM 'a%g' FOR '#') AS "abcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a#"%g' ESCAPE '#') AS "bcdefg";
+SELECT SUBSTRING('abcdefg' SIMILAR 'a%g' ESCAPE '#') AS "abcdefg";
-- substring() with just two arguments is not allowed by SQL spec;
-- we accept it, but we interpret the pattern as a POSIX regexp not SQL
--
2.27.0